o
    ?e                     @   s   d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 e
 Zdadd	 Zed
dd Zdd Zdd Zdd Zdd Zeddd Zdd Zeddd Zdd Zdd  Zd!d" Zd%d#d$ZdS )&z@Ops for GPU collective operations implemented using NVIDIA nccl.    N)context)def_functiondevice)ops)gen_nccl_opsc                 C   
   t d| S )a  Returns a list of tensors with the all-reduce sum across `tensors`.

  The computation is done with an all-reduce operation, so if only some of the
  returned tensors are evaluated then the computation will hang.

  Args:
    tensors: The input tensors across which to sum; must be assigned
      to GPU devices.

  Returns:
    List of tensors, each with the sum of the input tensors, where tensor i has
    the same device as `tensors[i]`.
  sum_apply_all_reducetensors r   _/home/www/facesmatcher.com/pyenv/lib/python3.10/site-packages/tensorflow/python/ops/nccl_ops.pyall_sum      
r   ZNcclAllReducec                 C   s~   |  ddkrtdt|| jd |  d}|  dd }t| j tj|d||d	W  d
   S 1 s8w   Y  d
S )a)  The gradients for `all_sum`.

  Args:
    op: The `all_sum` `Operation` that we are differentiating.
    grad: Gradient with respect to the output of the `all_sum` op.

  Returns:
    The gradient with respect to the output of `all_sum`.

  Raises:
    LookupError: If `reduction` is not `sum`.
  	reduction   sumANo gradient defined for NcclAllReduce except for reduction="sum".expectednum_devicesshared_names   _gradr	   inputr   r   r   N)get_attrLookupError_check_devicer   r   r   nccl_all_reduce)opgradr   r   r   r   r   _all_sum_grad.   s   
$r!   c                 C   r   )a  Returns a list of tensors with the all-reduce product across `tensors`.

  The computation is done with an all-reduce operation, so if only some of the
  returned tensors are evaluated then the computation will hang.

  Args:
    tensors: The input tensors across which to multiply; must be assigned
      to GPU devices.

  Returns:
    List of tensors, each with the product of the input tensors, where tensor i
    has the same device as `tensors[i]`.
  prodr
   r   r   r   r   all_prodL   r   r#   c                 C   r   )a  Returns a list of tensors with the all-reduce min across `tensors`.

  The computation is done with an all-reduce operation, so if only some of the
  returned tensors are evaluated then the computation will hang.

  Args:
    tensors: The input tensors across which to reduce; must be assigned
      to GPU devices.

  Returns:
    List of tensors, each with the minimum of the input tensors, where tensor i
    has the same device as `tensors[i]`.
  minr
   r   r   r   r   all_min]   r   r%   c                 C   r   )a  Returns a list of tensors with the all-reduce max across `tensors`.

  The computation is done with an all-reduce operation, so if only some of the
  returned tensors are evaluated then the computation will hang.

  Args:
    tensors: The input tensors across which to reduce; must be assigned
      to GPU devices.

  Returns:
    List of tensors, each with the maximum of the input tensors, where tensor i
    has the same device as `tensors[i]`.
  maxr
   r   r   r   r   all_maxn   r   r'   c                 C   r   )a  Returns a tensor with the reduce sum across `tensors`.

  The computation is done with a reduce operation, so only one tensor is
  returned.

  Args:
    tensors: The input tensors across which to sum; must be assigned
      to GPU devices.

  Returns:
    A tensor containing the sum of the input tensors.

  Raises:
    LookupError: If context is not currently using a GPU device.
  r	   )_apply_reducer   r   r   r   
reduce_sum   s   
r)   Z
NcclReducec                 C   sp   |  ddkrtdt|| jd t| j tj||jd}W d   n1 s+w   Y  |gt| j	 S )a\  The gradients for input `Operation` of `reduce_sum`.

  Args:
    op: The `sum send` `Operation` that we are differentiating.
    grad: Gradient with respect to the output of the `reduce_sum` op.

  Returns:
    The gradient with respect to the input of `reduce_sum` op.

  Raises:
    LookupError: If the reduction attribute of op is not `sum`.
  r   r   r   r   r   shapeN)
r   r   r   r   r   r   nccl_broadcastr+   leninputs)r   r    resultr   r   r   _reduce_sum_grad   s   r0   c                 C   sH   t |  t| j tj| | jdW  d   S 1 sw   Y  dS )a	  Returns a tensor that can be efficiently transferred to other devices.

  Args:
    tensor: The tensor to send; must be assigned to a GPU device.

  Returns:
    A tensor with the value of `src_tensor`, which can be used as input to
    ops on other GPU devices.
  r*   N)r   r   r   r   r,   r+   )tensorr   r   r   	broadcast   s   
$r2   ZNcclBroadcastc                 C   sb   dd |j jD }|D ]}t| qt| j tj|ddW  d   S 1 s*w   Y  dS )a-  The gradients for input `Operation` of `broadcast`.

  Args:
    op: The `broadcast send` `Operation` that we are differentiating.
    accumulated_grad: Accumulated gradients with respect to the output of the
      `broadcast` op.

  Returns:
    Gradients with respect to the input of `broadcast`.
  c                 S   s   g | ]}|qS r   r   .0tr   r   r   
<listcomp>   s    z#_broadcast_grad.<locals>.<listcomp>r	   r   r   N)r   r.   r   r   r   r   nccl_reduce)r   Zaccumulated_gradZgradsr5   r   r   r   _broadcast_grad   s   
$r9   c                    s<   st dt  fdd}t rt| S | S )z$Helper function for all_* functions.z-Must pass >0 tensors to all reduce operationsc               
      sb   g } D ]*}t | t|j | tj| td W d   n1 s)w   Y  q| S )zCall nccl allreduce.r   N)r   r   r   appendr   r   r-   )resr5   r   r   r   r   r   _all_reduce   s   z&_apply_all_reduce.<locals>._all_reduce)
ValueError_get_shared_namer   Zexecuting_eagerlyr   function)r   r   r=   r   r<   r   r      s   r   c                    s`   |st d|D ]}t| qtj|| d zt fdd|D  W  S  ty/   t dw )z'Helper function for reduce_* functions.z)Must pass >0 tensors to reduce operationsr7   c                 3   s     | ]}|j  j kr|V  qd S Nr   r3   r/   r   r   	<genexpr>   s    z _apply_reduce.<locals>.<genexpr>z3One input tensor must be assigned to current device)r>   r   r   r8   nextStopIteration)r   r   r5   r   rB   r   r(      s   
r(   c                  C   s>   t  t} td7 aW d    d|  S 1 sw   Y  d|  S )N   zc%s)_module_lock_shared_name_counter)valr   r   r   r?      s   

r?   c                 C   sP   t | j std|  d|r$|| j kr&td| d| j  d|  dd S d S )NzDevice assignment for tensor=z! required for nccl collective opszExpected device z, got z for tensor=.)r   canonical_namer>   )r1   r   r   r   r   r     s   
r   rA   )__doc__	threadingZtensorflow.python.eagerr   r   Ztensorflow.python.frameworkr   r   Ztensorflow.python.opsr   LockrG   rH   r   ZRegisterGradientr!   r#   r%   r'   r)   r0   r2   r9   r   r(   r?   r   r   r   r   r   <module>   s2   


	