"""muji.py does multi-gpu training for caffe2 with no need to change the C++
side code. Everything is defined on the computation graph level.

We support the following use cases:
  - 2 gpus, where peer access is enabled between them.
  - 4 gpus, where peer access is enabled between all of them.
  - 4 gpus, where peer access is enabled in two groups,
    between {1, 2} and {3, 4}
  - 8 gpus, where peer access is enabled in two groups,
    between {1, 2, 3, 4} and {5, 6, 7, 8}.
If the above cases are not satisfied, a fallback function which does not rely
on peer access will be called.
"""

import numpy as np

from caffe2.proto import caffe2_pb2
from caffe2.python import workspace
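

# Note (added commentary, not part of the original module): the cases listed
# in the docstring above are detected at graph-construction time from
# workspace.GetGpuPeerAccessPattern(), which returns an N x N boolean numpy
# array whose (i, j) entry says whether gpu i can directly access gpu j's
# memory. A 2-gpu machine with full peer access would yield something like
# np.array([[True, True], [True, True]]).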


def OnGPU(gpu_id):
  """A utility function that returns a device option protobuf of the
  specified gpu id.
  """
  device_option = caffe2_pb2.DeviceOption()
  device_option.device_type = workspace.GpuDeviceType
  device_option.device_id = gpu_id
  return device_option


def OnCPU():
  device_option = caffe2_pb2.DeviceOption()
  device_option.device_type = caffe2_pb2.CPU
  return device_option
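

# Usage sketch (illustrative, not part of the original module; blob names are
# hypothetical): the DeviceOption proto returned by OnGPU/OnCPU is what every
# helper below passes to operators in order to pin them to a device, e.g.
#
#   net.Add(["x", "y"], "z", device_option=OnGPU(0))  # Add runs on gpu 0
#   net.Copy("z", "z2", device_option=OnCPU())        # Copy runs on the cpu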


def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None):
  """The general Allreduce interface that reroutes the function calls.

  CPUs and AMD GPUs are not supported, because GetGpuPeerAccessPattern is
  called to get the gpu peer access pattern.
  """
  if gpu_indices is None:
    gpu_indices = list(range(len(blobs)))
  if len(gpu_indices) != len(blobs):
    raise RuntimeError(
        "gpu_indices length and blobs length mismatch: %d vs %d" %
        (len(gpu_indices), len(blobs)))
  pattern = workspace.GetGpuPeerAccessPattern()
  if len(blobs) == 2 and pattern.shape[0] >= 2 and np.all(pattern[:2, :2]):
    return Allreduce2(net, blobs, reduced_affix, gpu_indices)
  elif len(blobs) == 4 and pattern.shape[0] >= 4 and np.all(pattern[:4, :4]):
    return Allreduce4(net, blobs, reduced_affix, gpu_indices)
  elif (len(blobs) == 4 and pattern.shape[0] >= 4 and
        np.all(pattern[:2, :2]) and np.all(pattern[2:4, 2:4])):
    return Allreduce4Group2(net, blobs, reduced_affix, gpu_indices)
  elif len(blobs) == 8 and pattern.shape[0] >= 8 and np.all(pattern[:8, :8]):
    return Allreduce8(net, blobs, reduced_affix, gpu_indices)
  else:
    return AllreduceFallback(net, blobs, reduced_affix, gpu_indices)


def Allreduce2(net, blobs, reduced_affix, gpu_indices):
  """Allreduce for 2 gpus.

  Algorithm: 0r <- 0 + 1, 1r <- 0r, where r means "reduced"
  """
  a, b = blobs
  gpu_a, gpu_b = gpu_indices
  a_reduced = net.Add([a, b], a + reduced_affix, device_option=OnGPU(gpu_a))
  b_reduced = a_reduced.Copy(
      [], b + reduced_affix, device_option=OnGPU(gpu_b))
  return a_reduced, b_reduced
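

# Concretely (hypothetical blob names), Allreduce2(net, ["gpu_0/g", "gpu_1/g"],
# "_reduced", [0, 1]) emits two operators:
#   Add("gpu_0/g", "gpu_1/g") -> "gpu_0/g_reduced"  on gpu 0, reading gpu 1's
#                                                   blob via peer access
#   Copy("gpu_0/g_reduced")   -> "gpu_1/g_reduced"  on gpu 1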
}| j ||gt|| t|d}| j ||gt|| t|
d}|j ||t|d}|jg |t|
d}|jg t|| t|	d}|jg t|| t|d}||||fS )zAllreduce for 4 gpus.

  Algorithm: 2 level reduction.
      0r <- 0 + 1, 2r <- 2 + 3
      0r <- 0r + 2r
      2r <- 0r,
      1r <- 0r, 3r <- 2r
  )r   )r!   strr	   r"   )r   r   r   r   r#   r$   cdr%   r&   gpu_cgpu_dr'   	c_reducedr(   	d_reducedr   r   r   r   P   s*    	





def Allreduce4Group2(net, blobs, reduced_affix, gpu_indices):
  """Allreduce for 4 gpus where peer access is enabled in {0,1} and {2,3}.

  Algorithm: 2 level reduction.
      0r <- 0 + 1, 2r <- 2 + 3
      0r <- 0r + 2r
      2r <- 0r,
      1r <- 0r, 3r <- 2r
  """
  a, b, c, d = blobs
  gpu_a, gpu_b, gpu_c, gpu_d = gpu_indices
  # a_reduced <- a + b, c_reduced <- c + d
  a_reduced = net.Add(
      [a, b], str(a) + reduced_affix, device_option=OnGPU(gpu_a))
  c_reduced = net.Add(
      [c, d], str(c) + reduced_affix, device_option=OnGPU(gpu_c))
  # copy c_reduced over to gpu_a, since the two groups have no peer access
  c_reduced_copy = c_reduced.Copy(
      [], str(c_reduced) + '_copy', device_option=OnGPU(gpu_a))
  # a_reduced <- a_reduced + c_reduced_copy
  a_reduced = a_reduced.Add(
      c_reduced_copy, a_reduced, device_option=OnGPU(gpu_a))
  # broadcast a_reduced to c_reduced
  c_reduced = a_reduced.Copy([], c_reduced, device_option=OnGPU(gpu_c))
  # broadcast to b and d
  b_reduced = a_reduced.Copy(
      [], str(b) + reduced_affix, device_option=OnGPU(gpu_b))
  d_reduced = c_reduced.Copy(
      [], str(d) + reduced_affix, device_option=OnGPU(gpu_d))
  return a_reduced, b_reduced, c_reduced, d_reduced


def Allreduce8(net, blobs, reduced_affix, gpu_indices):
  """Allreduce for 8 gpus.

  Algorithm: 3 level reduction.
      0r <- 0 + 1, 2r <- 2 + 3, 4r <- 4 + 5, 6r <- 6 + 7
      0r <- 0r + 2r, 4r <- 4r + 6r
      0r <- 0r + 4r
      4r <- 0r
      2r <- 0r, 6r <- 4r
      1r <- 0r, 3r <- 2r, 5r <- 4r, 7r <- 6r
  """
  reduced = [None] * 8
  # Reduction level 1
  for i in [0, 2, 4, 6]:
    reduced[i] = net.Add(
        [blobs[i], blobs[i + 1]],
        blobs[i] + reduced_affix,
        device_option=OnGPU(gpu_indices[i]))
  # Reduction level 2
  for i in [0, 4]:
    reduced[i] = net.Add(
        [reduced[i], reduced[i + 2]],
        str(blobs[i]) + reduced_affix,
        device_option=OnGPU(gpu_indices[i]))
  # Reduction level 3: this involves a copy, since the two groups have no
  # peer access to each other.
  reduced_4_copy = reduced[4].Copy(
      [], str(reduced[4]) + '_copy', device_option=OnGPU(gpu_indices[0]))
  reduced[0] = reduced[0].Add(
      reduced_4_copy, reduced[0], device_option=OnGPU(gpu_indices[0]))
  # Broadcast level 1
  reduced[4] = reduced[0].Copy(
      [], reduced[4], device_option=OnGPU(gpu_indices[4]))
  # Broadcast level 2
  for i in [2, 6]:
    reduced[i] = reduced[i - 2].Copy(
        [], reduced[i], device_option=OnGPU(gpu_indices[i]))
  # Broadcast level 3
  for i in [1, 3, 5, 7]:
    reduced[i] = reduced[i - 1].Copy(
        [], blobs[i] + reduced_affix, device_option=OnGPU(gpu_indices[i]))
  return reduced


def AllreduceFallback(net, blobs, reduced_affix, gpu_indices):
  """A fallback option for Allreduce with no assumption on p2p.

  Algorithm: a flat operation on gpu 0
      0r <- 0
      0r <- 0r + i for i in gpu_indices[1:]
      ir <- 0r for i in gpu_indices[1:]
  """
  reduced = [None] * len(blobs)
  if reduced_affix != '':
    # copy first
    reduced[0] = net.Copy(
        blobs[0], blobs[0] + reduced_affix,
        device_option=OnGPU(gpu_indices[0]))
  else:
    reduced[0] = blobs[0]
  # do temp copy and add
  temp_name = reduced[0] + '_temp_copy'
  for i in range(1, len(blobs)):
    temp = net.Copy(
        blobs[i], temp_name, device_option=OnGPU(gpu_indices[0]))
    reduced[0] = net.Add(
        [temp, reduced[0]], reduced[0], device_option=OnGPU(gpu_indices[0]))
  # broadcast the reduced result to all other gpus
  for i in range(1, len(blobs)):
    reduced[i] = net.Copy(
        reduced[0], blobs[i] + reduced_affix,
        device_option=OnGPU(gpu_indices[i]))
  return reduced


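# Added note: unlike the tree reductions above, this fallback serializes all
# arithmetic on gpu_indices[0]. For n blobs it emits 2n - 1 Copy ops (when
# reduced_affix is non-empty) and n - 1 Add ops on that single device, so it
# is correct on any topology but slower than the peer-access variants.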
r   )r   N)__doc__Znumpyr   Zcaffe2.protor   Zcaffe2.pythonr   r	   r
   r    r   r   r   r   r   r   r   r   r   <module>   s   

(.<