B
    n\|                 @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlm  mZ ddlmZmZmZmZmZmZm Z m!Z! ddlm"Z"m#Z#m$Z$m%Z%m&Z& G d	d
 d
ej'Z(dS )    N)
DataLoader)Thread)colored   )BatchOfflineDatasetSortedTraceSamplerSortedTraceBatchSampler"SortedTraceBatchSamplerDistributedEmbeddingFeedForwardEmbeddingCNN2D5CEmbeddingCNN3D5C   )__version__util	OptimizerObserveEmbeddingLRSchedulerc                   s   e Zd Zi df fdd	Zdd Zd6ddZd7d	d
Zdd Zdd Zd8ddZ	dd Z
dd Zedd Zd9 fdd	Zd:ddZdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zddejejd.d/d0dd1dd2dd3fd4d5Z  ZS );InferenceNetwork c                sX  t    || _t | _d | _d| _d| _|| _	d | _
d | _i | _d | _d| _d| _d| _d | _td| _d | _td| _g | _g | _g | _g | _g | _g | _td| _td| _td| _ td| _!g | _"g | _#g | _$g | _%td| _&td| _'g | _(g | _)t* | _+d| _,d| _-t./d| _0|| _1d | _2d | _3d | _4d | _5d | _6d | _7d S )NFr   infg        cpu)8super__init___modelnn
ModuleDict_layers_observe_embedding_layers_observe_embedding_final_layers_pre_generated_layers_initialized_observe_embeddings_observe_embedding_dim_infer_observe_infer_observe_embedding
_optimizer_total_train_seconds_total_train_traces_total_train_iterations_loss_initialfloat	_loss_min	_loss_max_loss_previous_history_train_loss_history_train_loss_trace_history_valid_loss_history_valid_loss_trace_history_num_paramsZ_history_num_params_tracer   	to_tensor_distributed_train_loss_distributed_valid_loss_distributed_train_loss_min_distributed_valid_loss_min_distributed_history_train_loss%_distributed_history_train_loss_trace_distributed_history_valid_loss%_distributed_history_valid_loss_trace _distributed_filtered_train_loss _distributed_filtered_valid_loss(_distributed_history_filtered_train_loss(_distributed_history_filtered_valid_lossget_time_str	_modified_updates_on_cudatorchdevice_deviceZ_network_type_optimizer_type_learning_rate	_momentum_batch_size_distributed_backend_distributed_world_size)selfmodelobserve_embeddingsnetwork_type)	__class__ ^/global/project/projectdirs/dasrepo/etalumis/pyprob_distributed/pyprob/nn/inference_network.pyr      s^    






zInferenceNetwork.__init__c             C   s  t |dkrtdd}xB| D ]4\}}|j| }d|krPt|d }n
|j }d|krtt|d g}ntd	| tdg}d|kr|d }	ntd	| t
j}	|	t
jkrd	|kr|d	 }
ntd
	| d}
t|||
d}nB|	t
jkrt||d}n(|	t
jkr&t||d}ntd	|	|jtjd || j|< |t|7 }q$W || _td	| j t| j| jdd| _| jjtjd d S )Nr   zIAt least one observe embedding is needed to initialize inference network.reshapedimzBObservable {}: embedding dim not specified, using the default 256.   	embeddingzNObservable {}: observe embedding not specified, using the default FEEDFORWARD.depthzBObservable {}: embedding depth not specified, using the default 2.r   )input_shapeoutput_shape
num_layers)rY   rZ   zUnknown embedding: {})rE   zObserve embedding dimension: {})len
ValueErroritemsnamed_variablesrD   Sizevaluesizeprintformatr   FEEDFORWARDr   CNN2D5Cr   CNN3D5Cr   tor   rF   r   prodr"   r   )rM   rO   example_traceZobserve_embedding_total_dimnamera   variablerY   rZ   rW   rX   layerrR   rR   rS   _init_layers_observe_embeddingL   sD    





z/InferenceNetwork._init_layers_observe_embeddingNc                sj   g }xH| j  D ]:\ }t fdd|D t|d}||| qW tj|dd}| |}|S )Nc                s   g | ]}t |j  jqS rR   )r   r3   r_   ra   ).0trace)rk   rR   rS   
<listcomp>|   s    z3InferenceNetwork._embed_observe.<locals>.<listcomp>r   )rU   )	r   r^   rD   stackviewr\   appendcatr   )rM   tracesrW   rm   valuesrR   )rk   rS   _embed_observey   s    $
zInferenceNetwork._embed_observec             C   sd   || _ g }x:| j D ],\}}t|| dd}||| qW tj|dd}| 	|| _
d S )Nr   rr   )rU   )r#   r   r^   r   r3   rt   ru   rD   rv   r   r$   )rM   observerW   rk   rm   ra   rR   rR   rS   _infer_init   s    zInferenceNetwork._infer_initc             C   s
   t  d S )N)NotImplementedError)rM   rR   rR   rS   _init_layers   s    zInferenceNetwork._init_layersc             C   s
   t  d S )N)r|   )rM   batchrR   rR   rS   
_polymorph   s    zInferenceNetwork._polymorphc             C   s
   t  d S )N)r|   )rM   rl   Zprevious_variableproposal_min_train_iterationsrR   rR   rS   _infer_step   s    zInferenceNetwork._infer_stepc             C   s
   t  d S )N)r|   )rM   r~   rR   rR   rS   _loss   s    zInferenceNetwork._lossc                s|   t  | _|  jd7  _i  t d< tj d< t|  d< d  d _d  d _ fdd}t	|d}|
  |  d S )Nr   pyprob_versiontorch_versioninference_networkc                 sb   t jtt d} tj| d}t	 | t
jddd}|j|dd |  t|  d S )N)suffixpyprob_inference_networkzw:gzr   )compresslevel)arcname)tempfilemkdtempstruuiduuid4ospathjoinrD   savetarfileopenaddcloseshutilrmtree)tmp_dirtmp_file_nametar)data	file_namerR   rS   thread_save   s    z+InferenceNetwork._save.<locals>.thread_save)target)r   r@   rA   rB   r   rD   copyr   r%   r   startr   )rM   r   r   trR   )r   r   rS   _save   s    




zInferenceNetwork._savec             C   s  ytt | d}tjtt d}tj	|d}|
d| |  tjrVt|}ntj|dd d}t| W n   tdY nX |d tkrttd	|d td
dgd |d tjkrttd|d tjd
dgd |d }tjrL|jr.|jtjkrJttd|jtjd
dgd nttdtjd
dgd n$|jrpttd|jd
dgd |jtjd |S )Nzr:gz)r   r   c             S   s   | S )NrR   )storagelocrR   rR   rS   <lambda>   s    z(InferenceNetwork._load.<locals>.<lambda>)map_locationzCannot load inference network.r   zKWarning: different pyprob versions (loaded network: {}, current system: {})redbold)attrsr   zLWarning: different PyTorch versions (loaded network: {}, current system: {})r   z=Warning: loading CUDA (device {}) network to CUDA (device {})z0Warning: loading CPU network to CUDA (device {})z0Warning: loading CUDA (device {}) network to CPU)rE   )r   r   r   r   r   r   r   r   r   r   extractr   r   _cuda_enabledrD   loadr   r   RuntimeErrorr   rc   r   rd   rC   rF   rh   )r   r   r   tmp_filer   retrR   rR   rS   _load   s4     ""zInferenceNetwork._loadc                s.   || _ dt|k| _t j||d|i d S )NcudarE   )rF   r   rC   r   rh   )rM   rE   argskwargs)rQ   rR   rS   rh      s    zInferenceNetwork.to@   c       
      C   s   | j s*| j| j|dd |   d| _ d| _t||dddd d}tdt	|d d}x`t
|D ]T\}}|t	|7 }| |}t| |rf|d k	rfd	|}	td
dd | |	 qfW td d S )Nr   )rj   Tc             S   s   t | S )N)r   )xrR   rR   rS   r      s    z7InferenceNetwork._pre_generate_layers.<locals>.<lambda>)
batch_sizeshufflenum_workers
collate_fnzLayer pre-generation...Tracesz!{}_00000000_pre_generated.networkzSaving to disk...  )endzLayer pre-generation complete)r    rn   r!   __getitem__r}   r   r   r   progress_bar_initr\   	enumerater   progress_bar_updaterd   rc   r   progress_bar_end)
rM   datasetr   save_file_name_prefix
dataloaderii_batchr~   layers_changedr   rR   rR   rS   _pre_generate_layers   s"    


z%InferenceNetwork._pre_generate_layersc             C   s$   x|   D ]}t|jd q
W dS )z) broadcast rank 0 parameter to all ranks r   N)
parametersdist	broadcastr   )rM   paramrR   rR   rS   _distributed_sync_parameters   s    z-InferenceNetwork._distributed_sync_parametersc             C   sN   d}t |  }|d jj}t|j}x|D ]}||j 7 }q.W |||fS )Nr   )listr   r   dtypenpitemsizenumel)rM   Ztotal_itemsZall_parameterselement_typeelement_sizepararR   rR   rS   _count_items   s    
zInferenceNetwork._count_itemsc       	      C   sd   d}d}xV|   D ]J}|j}|j}| }tj| j|||d|}|d |d< ||| 7 }qW dS )zI pack all items of data into a global array, this is typically for rank 0r   )r   countoffset.N)r   r   shaper   r   
frombuffergpararT   )	rM   r   r   r   r   r   idata
data_shapert   rR   rR   rS   
_pack_data   s    zInferenceNetwork._pack_datac       
      C   sd   d}d}xV|D ]N}|j }|j}| }tj| j|||d|}	t|		 |_ ||| 7 }qW dS )zF unpack global array into items of data, this is typically for rank 1+r   )r   r   r   N)
r   r   r   r   r   r   rT   rD   	as_tensorr   )
rM   r   r   r   r   r   r   r   r   rt   rR   rR   rS   _unpack_data  s    
zInferenceNetwork._unpack_datac             C   s   t d d S )NzNot implemented)rc   )rM   rR   rR   rS    _distributed_sync_parameters_mpi  s    z1InferenceNetwork._distributed_sync_parameters_mpic             C   s   t dd |  D }t|g g }x\t|  D ]L\}}|jdk	rX||jj q6|| r6t t	
|j|_||jj q6W t| x|D ]}|t| }qW dS )z! all_reduce grads from all ranks c             S   s   g | ]}|j d k	rdndqS )Nr   r   )grad)ro   prR   rR   rS   rq     s    z;InferenceNetwork._distributed_sync_grad.<locals>.<listcomp>N)r   r3   r   r   
all_reducer   r   ru   r   rD   
zeros_liker*   )rM   
world_sizettmapglr   r   ZlirR   rR   rS   _distributed_sync_grad  s    


z'InferenceNetwork._distributed_sync_gradc             C   s   t t|| _t| j |  jt|  _| jt| j | j| j	 | jd| d  }t
|t| | _| j| j t| j| jk rt| j| _d S )Nr   )r   r3   r*   r4   r   r   r8   ru   r9   r'   sumr\   r<   r>   r6   )rM   lossr   loss_moving_average_window_sizeZrecent_lossesrR   rR   rS   _distributed_update_train_loss'  s    z/InferenceNetwork._distributed_update_train_lossc             C   s   t t|| _t| j |  jt|  _| jt| j | j| j	 t| j| j
k rjt| j| _
tjdkrttd| j| j
ddgd d S )Nr   zNDistributed mean valid. loss across ranks : {:+.2e}, min. valid. loss: {:+.2e}yellowr   )r   )r   r3   r*   r5   r   r   r:   ru   r;   r'   r7   get_rankrc   r   rd   )rM   r   r   rR   rR   rS   _distributed_update_valid_loss5  s    
z/InferenceNetwork._distributed_update_valid_lossc             C   s   d}d}d}x| j jD ]}tdd |d D }xt|d D ]\}}|| sRq@t|j}	|jj}
t|
}|	dkr|dkr||	 | }n|}|dkr|}nt	||d	 |d	  }||
 }||j_q@W qW | j 
  t|}|S )
NclipgMb`?gMb?c             S   s   g | ]}|j d k	rdndqS )Nr   r   )r   )ro   r   rR   rR   rS   rq   N  s    z>InferenceNetwork.larc_optimizer_train_step.<locals>.<listcomp>paramsg        scalelr)r%   param_groupsr   r3   r   rD   normr   r   minstepr*   )rM   r   Z	LARC_modeZLARC_etaZLARC_epsilongroupr   r   r   weight_normgZ	grad_normZlarc_local_lrZeffective_lrZg_scaledrR   rR   rS   larc_optimizer_train_stepE  s,    

z*InferenceNetwork.larc_optimizer_train_stepg-C6?g?gh㈵>iX  i'  r   c       <         s  d}| j s.| j| j|dd |   d| _ |d krBd}d}ntj|d t }t }t	
||d |dkr:ttddd	gd
 ttd|dd	gd
 ttd|dd	gd
 ttd|| |dd	gd
 ttd|| |dd	gd
 ttdt|dd	gd
 ttdt|dd	gd
 || _|| _g | _g | _g | _g | _t	d| _t	d| _td| _td| _t	d| _t	d| _g | _g | _|| _|| _ || | _!|	| _"| #  | j$}t%% }t%% }t%% }|d krt&d|d }|d kr|}| d }d}d}d}| j!}d}d}d} d}!td d}"d}#d}$t%% | }%t'|t(rt)|t*||||dd|dd d}&|&}'nt)||ddd d}&|&}'|d k	rt)|t*||||dd|dd d}(| j+rd})| j,d ks|)rx|t-j.ks|t-j/kr,t0j1| 2 || |
d | _,nL|t-j3ksD|t-j4krdt0j3| 2 || |	d|
d!| _,ntd"| t5  d |t6j7krt8j9| j,d#d$d%}*n|t6j:krt8j;| j,d#d&gd$d'}*nx|t6j<kr fd(d}+t8j=| j,|+d)d*}*nL|t6j>kr fd+d},t8j=| j,|,d)d*}*n |t6j?kr8t8j@| j,dd,d)d-}*xn|!s|d7 }|*A  |dkr`|&n|'}-x<tB|-D ].\}.}/|dkr|| dkr| C  | j+rd})n
| D|/})| j,E  | F|/\}0}1|0sttd.|1d/d	gd
 n|1G  |dkr
| H| |t-j/t-j4gkr0| j,A  t|1}1n
| I|1}1| jJd krR|1| _J|1| _Kd0| jJ}2|1| jLk r|1| _Ltd0|1d1d	gd
}3td0| jLd1d	gd
}#t%% }tt	Mdd1d	gd
}$n|1| jKkr|1| _Ktd0|1d/d	gd
}3nd|1| jNk rtd0|1d1}3n(|1| jNkr$td0|1d/}3n
d0|1}3d0| jL}#t	Mt%% | }$|1| _N|  jOd7  _O| |/jP7 } |  jQ|/jP| 7  _Qd2d3| jQ}4d4d3|}5|t%% |  | _$t	M| j$}6d5tR|/jP| t%% |  }7t%% }|d k	r | jQ|kr d}!| jST|1 | jUT| jQ |d k	rtd6 | | |krtd7d8d9 d}8tVW 4 x,tB|(D ] \}.}/| F|/\}9}:|8|:7 }8q^W W d Q R X t|8tX| }8| jYT|8 | jZT| jQ | d }|dkr| [|1|| td0| jd}3td0| jd}#| \|8| |dkrB| [|1|| td0| jd}3td0| jd}#d:|6|5|4|2|#|3|$|7};t&tX|;|"}"|dkrt|;]|"d8d9 t^j_`  |!rP |d7 }qpW q<W d S );Ni'  r   )rj   Tr   )backendz Distributed synchronous trainingr   r   )r   zDistributed backend       : {}zDistributed world size    : {}z6Distributed minibatch size: {} (global), {} (per node)z:Distributed initial learning rate : {} (global), {} (base)zDistributed optimizer     : {}z/Distributed learning rate scheduling method: {}g        r   d   i  
   Fz]Train. time | Epoch| Trace     | Init. loss| Min. loss | Curr. loss| T.since min | Traces/secr   )r   num_replicasrankr   c             S   s   t | S )N)r   )r   rR   rR   rS   r     s    z+InferenceNetwork.optimize.<locals>.<lambda>)batch_samplerr   r   c             S   s   t | S )N)r   )r   rR   rR   rS   r     s    )r   r   r   c             S   s   t | S )N)r   )r   rR   rR   rS   r     s    )r   weight_decay)r   momentumnesterovr  zUnknown optimizer type: {}   g?)	step_sizegammaP   )
milestonesr	  c                s   dt |    d S )Nr   r   )r*   )epoch)	max_epochrR   rS   r     s    rr   )	lr_lambda
last_epochc                s   dt |    S )Nr   )r*   )r  )r  rR   rS   r     s    g-C6?)T_maxeta_minr  z-Cannot compute loss, skipping batch. Loss: {}r   z{:+.2e}greenz{:9}z{:,}z{:4}z{:,.1f}zdataset_valid is not NonezComputing validation loss...  r   )r   z%{} | {} | {} | {} | {} | {} | {} | {})ar    rn   r!   r   r}   r   init_process_groupget_world_sizer   r   init_distributed_printrc   r   rd   r   rK   rL   r8   r9   r:   r;   r3   r4   r5   r*   r6   r7   r<   r=   r>   r?   rG   rJ   rH   rI   trainr&   timemax
isinstancer   r   r
   r   r%   r   ADAM	LARC_ADAMoptimAdamr   SGDLARC_SGDquitr   STEPlr_schedulerStepLRMULTI_STEPSMultiStepLRPOLY_2LambdaLRPOLY_1COSINEANNEALINGCosineAnnealingLRr   r   r   r   	zero_gradr   backwardr   r   r)   r,   r+   days_hours_mins_secs_strr-   r(   rb   r'   intr.   ru   r/   rD   no_gradr\   r0   r1   r   r   ljustsysstdoutflush)<rM   
num_tracesr   dataset_validr   valid_everyoptimizer_typeLR_schedule_methodlearning_rater  r  r   save_every_secdistributed_backendZdistributed_params_sync_everyZdistributed_loss_update_everydataloader_offline_num_workersr   r   Zdistributed_world_sizeZdistributed_rankZprev_total_train_seconds
time_startZtime_loss_minZtime_last_batchZlast_validation_tracer   ZEnable_MA_filterZper_rank_printbase_lrr  	iterationrp   stopZmax_print_line_lenZloss_min_strZtime_since_loss_min_strZlast_auto_save_timeZdataloader_epoch_oneZdataloader_epoch_allZdataloader_validr   Z	schedulerZlambda1Zlambda2r   r   r~   successr   Zloss_initial_strZloss_strZtotal_train_traces_strZ	epoch_strZtotal_training_seconds_strZtraces_per_second_strZ
valid_loss_vZ
print_linerR   )r  rS   optimizel  s`   
  





"
" 









 




	

zInferenceNetwork.optimize)N)N)NN)N)r   N)__name__
__module____qualname__r   rn   ry   r{   r}   r   r   r   r   staticmethodr   rh   r   r   r   r   r   r   r   r   r   r   r   r  r   r&  rD  __classcell__rR   rR   )rQ   rS   r      s,   3-
	
	
"
'r   ))numpyrD   torch.nnr   torch.optimr  torch.distributeddistributedr   torch.utils.datar   r1  r  r   r   r   r   r   r   	threadingr   	termcolorr   Ztorch.optim.lr_schedulerr"  r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   Moduler   rR   rR   rR   rS   <module>   s&   (