B
    #g\]                 @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ ddlmZmZmZmZmZmZmZ ddlmZmZm Z m!Z! G d	d
 d
ej"Z#dS )    N)
DataLoader)Thread)colored   )BatchOfflineDatasetSortedTraceSamplerSortedTraceBatchSamplerEmbeddingFeedForwardEmbeddingCNN2D5CEmbeddingCNN3D5C   )__version__util	OptimizerObserveEmbeddingc                   s   e Zd Zi df fdd	Zdd Zd/ddZd0d	d
Zdd Zdd Zd1ddZ	dd Z
dd Zedd Zd2 fdd	Zd3ddZdd Zdd Zd d! Zd"d# Zd$d% Zddejd&d'd(dd)dd*dd+d,fd-d.Z  ZS )4InferenceNetwork c                s4  t    || _t | _d | _d| _d| _|| _	d | _
d | _i | _d | _d| _d| _d| _d | _td| _d | _td| _g | _g | _g | _g | _g | _g | _td| _td| _td| _ td| _!g | _"g | _#g | _$g | _%t& | _'d| _(d| _)t*+d| _,|| _-d | _.d | _/d | _0d | _1d | _2d | _3d S )NFr   infg        cpu)4super__init___modelnn
ModuleDict_layers_observe_embedding_layers_observe_embedding_final_layers_pre_generated_layers_initialized_observe_embeddings_observe_embedding_dim_infer_observe_infer_observe_embedding
_optimizer_total_train_seconds_total_train_traces_total_train_iterations_loss_initialfloat	_loss_min	_loss_max_loss_previous_history_train_loss_history_train_loss_trace_history_valid_loss_history_valid_loss_trace_history_num_paramsZ_history_num_params_tracer   	to_tensor_distributed_train_loss_distributed_valid_loss_distributed_train_loss_min_distributed_valid_loss_min_distributed_history_train_loss%_distributed_history_train_loss_trace_distributed_history_valid_loss%_distributed_history_valid_loss_traceget_time_str	_modified_updates_on_cudatorchdevice_deviceZ_network_type_optimizer_type_learning_rate	_momentum_batch_size_distributed_backend_distributed_world_size)selfmodelobserve_embeddingsnetwork_type)	__class__ Y/global/project/projectdirs/dasrepo/etalumis/pyprob_master/pyprob/nn/inference_network.pyr      sV    






zInferenceNetwork.__init__c             C   s  t |dkrtdd}xB| D ]4\}}|j| }d|krPt|d }n
|j }d|krtt|d g}ntd	| tdg}d|kr|d }	ntd	| t
j}	|	t
jkrd	|kr|d	 }
ntd
	| d}
t|||
d}nB|	t
jkrt||d}n(|	t
jkr&t||d}ntd	|	|jtjd || j|< |t|7 }q$W || _td	| j t| j| jdd| _| jjtjd d S )Nr   zIAt least one observe embedding is needed to initialize inference network.reshapedimzBObservable {}: embedding dim not specified, using the default 256.   	embeddingzNObservable {}: observe embedding not specified, using the default FEEDFORWARD.depthzBObservable {}: embedding depth not specified, using the default 2.r   )input_shapeoutput_shape
num_layers)rS   rT   zUnknown embedding: {})r?   zObserve embedding dimension: {})len
ValueErroritemsnamed_variablesr>   Sizevaluesizeprintformatr   FEEDFORWARDr
   CNN2D5Cr   CNN3D5Cr   tor   r@   r   prodr    r   )rG   rI   example_traceZobserve_embedding_total_dimnamer[   variablerS   rT   rQ   rR   layerrL   rL   rM   _init_layers_observe_embeddingF   sD    





z/InferenceNetwork._init_layers_observe_embeddingNc                sj   g }xH| j  D ]:\ }t fdd|D t|d}||| qW tj|dd}| |}|S )Nc                s   g | ]}t |j  jqS rL   )r   r1   rY   r[   ).0trace)re   rL   rM   
<listcomp>v   s    z3InferenceNetwork._embed_observe.<locals>.<listcomp>r   )rO   )	r   rX   r>   stackviewrV   appendcatr   )rG   tracesrQ   rg   valuesrL   )re   rM   _embed_observes   s    $
zInferenceNetwork._embed_observec             C   sd   || _ g }x:| j D ],\}}t|| dd}||| qW tj|dd}| 	|| _
d S )Nr   rl   )rO   )r!   r   rX   r   r1   rn   ro   r>   rp   r   r"   )rG   observerQ   re   rg   r[   rL   rL   rM   _infer_init|   s    zInferenceNetwork._infer_initc             C   s
   t  d S )N)NotImplementedError)rG   rL   rL   rM   _init_layers   s    zInferenceNetwork._init_layersc             C   s
   t  d S )N)rv   )rG   batchrL   rL   rM   
_polymorph   s    zInferenceNetwork._polymorphc             C   s
   t  d S )N)rv   )rG   rf   Zprevious_variableproposal_min_train_iterationsrL   rL   rM   _infer_step   s    zInferenceNetwork._infer_stepc             C   s
   t  d S )N)rv   )rG   rx   rL   rL   rM   _loss   s    zInferenceNetwork._lossc                s|   t  | _|  jd7  _i  t d< tj d< t|  d< d  d _d  d _ fdd}t	|d}|
  |  d S )Nr   pyprob_versiontorch_versioninference_networkc                 sb   t jtt d} tj| d}t	 | t
jddd}|j|dd |  t|  d S )N)suffixpyprob_inference_networkzw:gzr   )compresslevel)arcname)tempfilemkdtempstruuiduuid4ospathjoinr>   savetarfileopenaddcloseshutilrmtree)tmp_dirtmp_file_nametar)data	file_namerL   rM   thread_save   s    z+InferenceNetwork._save.<locals>.thread_save)target)r   r:   r;   r<   r   r>   copyr   r#   r   startr   )rG   r   r   trL   )r   r   rM   _save   s    




zInferenceNetwork._savec             C   s  ytt | d}tjtt d}tj	|d}|
d| |  tjrVt|}ntj|dd d}t| W n   tdY nX |d tkrttd	|d td
dgd |d tjkrttd|d tjd
dgd |d }tjrL|jr.|jtjkrJttd|jtjd
dgd nttdtjd
dgd n$|jrpttd|jd
dgd |jtjd |S )Nzr:gz)r   r   c             S   s   | S )NrL   )storagelocrL   rL   rM   <lambda>   s    z(InferenceNetwork._load.<locals>.<lambda>)map_locationzCannot load inference network.r}   zKWarning: different pyprob versions (loaded network: {}, current system: {})redbold)attrsr~   zLWarning: different PyTorch versions (loaded network: {}, current system: {})r   z=Warning: loading CUDA (device {}) network to CUDA (device {})z0Warning: loading CPU network to CUDA (device {})z0Warning: loading CUDA (device {}) network to CPU)r?   )r   r   r   r   r   r   r   r   r   r   extractr   r   _cuda_enabledr>   loadr   r   RuntimeErrorr   r]   r   r^   r=   r@   rb   )r   r   r   tmp_filer   retrL   rL   rM   _load   s4     ""zInferenceNetwork._loadc                s.   || _ dt|k| _t j||d|i d S )Ncudar?   )r@   r   r=   r   rb   )rG   r?   argskwargs)rK   rL   rM   rb      s    zInferenceNetwork.to@   c       
      C   s   | j s*| j| j|dd |   d| _ d| _t||dddd d}tdt	|d d}x`t
|D ]T\}}|t	|7 }| |}t| |rf|d k	rfd	|}	td
dd | |	 qfW td d S )Nr   )rd   Tc             S   s   t | S )N)r   )xrL   rL   rM   r      s    z7InferenceNetwork._pre_generate_layers.<locals>.<lambda>)
batch_sizeshufflenum_workers
collate_fnzLayer pre-generation...Tracesz!{}_00000000_pre_generated.networkzSaving to disk...  )endzLayer pre-generation complete)r   rh   r   __getitem__rw   r   r   r   progress_bar_initrV   	enumeratery   progress_bar_updater^   r]   r   progress_bar_end)
rG   datasetr   save_file_name_prefix
dataloaderii_batchrx   layers_changedr   rL   rL   rM   _pre_generate_layers   s"    


z%InferenceNetwork._pre_generate_layersc             C   s$   x|   D ]}t|jd q
W dS )z) broadcast rank 0 parameter to all ranks r   N)
parametersdist	broadcastr   )rG   paramrL   rL   rM   _distributed_sync_parameters   s    z-InferenceNetwork._distributed_sync_parametersc             C   s*   x$|   D ]}tt|j|_q
W d S )N)r   r   r1   r>   
zeros_liker   grad)rG   r   rL   rL   rM   _distributed_zero_grad   s    z'InferenceNetwork._distributed_zero_gradc          	   C   s`   xZ|   D ]N}y&t|jj |j jt|  _W q
 tk
rV   td|  Y q
X q
W dS )z! all_reduce grads from all ranks zNone for grad, with param.size=N)	r   r   
all_reducer   r   r(   AttributeErrorr]   r\   )rG   
world_sizer   rL   rL   rM   _distributed_sync_grad   s    z'InferenceNetwork._distributed_sync_gradc             C   s   t t|| _t| j |  jt|  _| jt| j | j| j	 t| j| j
k rjt| j| _
ttd| j| j
ddgd d S )NzNDistributed mean train. loss across ranks : {:+.2e}, min. train. loss: {:+.2e}yellowr   )r   )r   r1   r(   r2   r   r   r6   ro   r7   r%   r4   r]   r   r^   )rG   lossr   rL   rL   rM   _distributed_update_train_loss   s    z/InferenceNetwork._distributed_update_train_lossc             C   s   t t|| _t| j |  jt|  _| jt| j | j| j	 t| j| j
k rjt| j| _
ttd| j| j
ddgd d S )NzNDistributed mean valid. loss across ranks : {:+.2e}, min. valid. loss: {:+.2e}r   r   )r   )r   r1   r(   r3   r   r   r8   ro   r9   r%   r5   r]   r   r^   )rG   r   r   rL   rL   rM   _distributed_update_valid_loss  s    z/InferenceNetwork._distributed_update_valid_lossg-C6?g?gh㈵>iX  i'  r   Fc       4      O   s   | j s*| j| j|dd |   d| _ |d kr<d}d}ntj|d t }t }t	
||d ttddd	gd
 ttd|dd	gd
 ttd|dd	gd
 ttd|| |dd	gd
 ttd|| |dd	gd
 ttdt|dd	gd
 || _|| _|| _|| _|| | _|| _|   | j}t }t }t }|d krrtd|d }|d kr|}| d }d}d}d}d}td d}d}d} t | }!t|tr|dkrt|t||dd|dd d}"nt||ddd d}"|d k	r"t||ddd d}#x|s|d7 }xt|"D ]\}$}%|dkrj|| dkrj|   | jrxd}&n
|  |%}&| j!d ks|&r|t"j#krt$j%| & || |	d| _!nt$j'| & || |d|	d| _!|dkr| (  n
| j!)  | *|%\}'}(|'s4ttd|(dd	gd
 |rd S n|(+  |dkrP| ,| | j!-  t.|(}(| j/d krz|(| _/|(| _0d| j/})|(| j1k r|(| _1td|(d d	gd
}*td| j1d d	gd
}t }tt	2dd d	gd
} n|(| j0kr|(| _0td|(dd	gd
}*nd|(| j3k r.td|(d }*n(|(| j3krLtd|(d}*n
d|(}*d| j1}t	2t | } |(| _3|  j4d7  _4||%j57 }|  j6|%j5| 7  _6d!d"| j6}+d#d"|},|t |  | _t	2| j}-d$t7|%j5| t |  }.t }|d k	r&||kr&d}| j89|( | j:9| j6 |d k	r|| |krtd%d&d' d}/t;< 4 x,t|#D ] \}$}%| *|%\}0}1|/|17 }/q|W W d Q R X t.|/t=| }/| j>9|/ | j?9| j6 |d }|dkr| @|(| | A|/| |dkr"|| dkr"| @|(| |dkrz|
d k	rzt |! |krzt }!d(|
t	B | j6}2td)d&d' | C|2 d*|-|,|+|)||*| |.}3tt=|3|}t|3D|d&d' tEjFG  |rP |d7 }q@W q&W t  |dkr|
d k	rd(|
t	B | j6}2td)d&d' | C|2 d S )+Nr   )rd   Tr   )backendFz Distributed synchronous trainingr   r   )r   zDistributed backend       : {}zDistributed world size    : {}z6Distributed minibatch size: {} (global), {} (per node)z2Distributed learning rate : {} (global), {} (base)zDistributed optimizer     : {}d   i  z]Train. time | Epoch| Trace     | Init. loss| Min. loss | Curr. loss| T.since min | Traces/secr   )r   r   c             S   s   t | S )N)r   )r   rL   rL   rM   r   ?  s    z+InferenceNetwork.optimize.<locals>.<lambda>)batch_samplerr   r   c             S   s   t | S )N)r   )r   rL   rL   rM   r   A  s    )r   r   r   c             S   s   t | S )N)r   )r   rL   rL   rM   r   C  s    )lrweight_decay)r   momentumnesterovr   z-Cannot compute loss, skipping batch. Loss: {}r   z{:+.2e}greenz{:9}z{:,}z{:4}z{:,.1f}zComputing validation loss...  r   )r   z{}_{}_traces_{}.networkzSaving to disk...  z%{} | {} | {} | {} | {} | {} | {} | {})Hr   rh   r   r   rw   r   init_process_groupget_world_sizeget_rankr   init_distributed_printr]   r   r^   r   rE   rF   rA   rD   rB   rC   trainr$   timemax
isinstancer   r   r	   r   r   r   ry   r#   r   ADAMoptimAdamr   SGDr   	zero_gradr|   backwardr   stepr(   r'   r*   r)   days_hours_mins_secs_strr+   r&   r\   r%   intr,   ro   r-   r>   no_gradrV   r.   r/   r   r   get_time_stampr   ljustsysstdoutflush)4rG   
num_tracesr   dataset_validr   valid_everyoptimizer_typelearning_rater   r   r   save_every_secdistributed_backendZdistributed_params_sync_everyZdistributed_loss_update_everydataloader_offline_num_workersstop_with_bad_lossr   r   Zdistributed_world_sizeZdistributed_rankZprev_total_train_seconds
time_startZtime_loss_minZtime_last_batchZlast_validation_traceepoch	iterationrj   stopZmax_print_line_lenZloss_min_strZtime_since_loss_min_strZlast_auto_save_timer   Zdataloader_validr   rx   r   successr   Zloss_initial_strZloss_strZtotal_train_traces_strZ	epoch_strZtotal_training_seconds_strZtraces_per_second_strZ
valid_loss_vr   Z
print_linerL   rL   rM   optimize  s     



 










 






zInferenceNetwork.optimize)N)N)NN)N)r   N)__name__
__module____qualname__r   rh   rs   ru   rw   ry   r{   r|   r   staticmethodr   rb   r   r   r   r   r   r   r   r   r  __classcell__rL   rL   )rK   rM   r      s$   /-
	
	
"


r   )$r>   torch.nnr   torch.optimr   torch.distributeddistributedr   torch.utils.datar   r   r   r   r   r   r   r   r   	threadingr   	termcolorr   r   r   r   r   r	   r
   r   r   r   r   r   r   Moduler   rL   rL   rL   rM   <module>   s"   $