B
    \}\4}                 @   s  d dl Z d dlmZ d dlmZ d dlm  mZ d dlmZ	 d dl
mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ ddlmZmZmZmZmZmZm Z  ddl!m"Z" dd	lm#Z#m$Z$m%Z%m&Z&m'Z' G d
d dej(Z)dS )    N)
DataLoader)Thread)colored   )BatchOfflineDatasetTraceBatchSamplerDistributedTraceBatchSamplerEmbeddingFeedForwardEmbeddingCNN2D5CEmbeddingCNN3D5C)LARC   )__version__util	OptimizerLearningRateSchedulerObserveEmbeddingc                   s   e Zd Zi df fdd	Zdd Zd4ddZd5d	d
Zdd Zdd Zd6ddZ	dd Z
dd Zedd Zd7 fdd	Zd8ddZdd Zdd Zd d! Zd"d# Zd9d$d%Zd:d&d'Zdd(ddejd)d*ejd+d,dd-dd.d/d0d1dfd2d3Z  ZS );InferenceNetwork c                sV  t    || _t | _d | _d| _d| _|| _	d | _
d | _i | _d | _d | _d | _d | _d | _d | _d | _d | _d| _d| _d | _d| _d | _d | _d | _td| _d | _td| _g | _ g | _!g | _"g | _#g | _$g | _%t&'d| _(t&'d| _)g | _*g | _+g | _,g | _-t&. | _/d| _0d| _1t23d| _4d | _5d | _d | _6d | _7d | _8|| _9d S )NFr   infg        cpu):super__init___modelnn
ModuleDict_layers_observe_embedding_layers_observe_embedding_final_layers_pre_generated_layers_initialized_observe_embeddings_observe_embedding_dim_infer_observe_infer_observe_embedding
_optimizer_optimizer_type_optimizer_state	_momentum_weight_decay_learning_rate_scheduler_learning_rate_scheduler_type_learning_rate_scheduler_state_total_train_seconds_total_train_traces_total_train_traces_end_total_train_iterations_learning_rate_init_learning_rate_end
_loss_initfloat	_loss_min	_loss_max_loss_previous_history_train_loss_history_train_loss_trace_history_valid_loss_history_valid_loss_trace_history_num_paramsZ_history_num_params_tracer   	to_tensor_distributed_train_loss_distributed_valid_loss_distributed_history_train_loss%_distributed_history_train_loss_trace_distributed_history_valid_loss%_distributed_history_valid_loss_traceget_time_str	_modified_updates_on_cudatorchdevice_deviceZ_learning_rateZ_batch_size_distributed_backend_distributed_world_sizeZ_network_type)selfmodelobserve_embeddingsnetwork_type)	__class__ _/global/project/projectdirs/dasrepo/etalumis/pyprob_master_feb28/pyprob/nn/inference_network.pyr      sd    




zInferenceNetwork.__init__c             C   s  t |dkrtdd}xB| D ]4\}}|j| }d|krPt|d }n
|j }d|krtt|d g}ntd	| tdg}d|kr|d }	ntd	| t
j}	|	t
jkrd	|kr|d	 }
ntd
	| d}
t|||
d}nB|	t
jkrt||d}n(|	t
jkr&t||d}ntd	|	|jtjd || j|< |t|7 }q$W || _td	| j t| j| jdd| _| jjtjd d S )Nr   zIAt least one observe embedding is needed to initialize inference network.reshapedimzBObservable {}: embedding dim not specified, using the default 256.   	embeddingzNObservable {}: observe embedding not specified, using the default FEEDFORWARD.depthzBObservable {}: embedding depth not specified, using the default 2.r   )input_shapeoutput_shape
num_layers)rY   rZ   zUnknown embedding: {})rI   zObserve embedding dimension: {})len
ValueErroritemsnamed_variablesrH   Sizevaluesizeprintformatr   FEEDFORWARDr
   CNN2D5Cr   CNN3D5Cr   tor   rJ   r   prodr"   r   )rM   rO   example_traceZobserve_embedding_total_dimnamera   variablerY   rZ   rW   rX   layerrR   rR   rS   _init_layers_observe_embeddingO   sD    





z/InferenceNetwork._init_layers_observe_embeddingNc                sj   g }xH| j  D ]:\ }t fdd|D t|d}||| qW tj|dd}| |}|S )Nc                s   g | ]}t |j  jqS rR   )r   r=   r_   ra   ).0trace)rk   rR   rS   
<listcomp>   s    z3InferenceNetwork._embed_observe.<locals>.<listcomp>r   )rU   )	r   r^   rH   stackviewr\   appendcatr   )rM   tracesrW   rm   valuesrR   )rk   rS   _embed_observe|   s    $
zInferenceNetwork._embed_observec             C   sd   || _ g }x:| j D ],\}}t|| dd}||| qW tj|dd}| 	|| _
d S )Nr   rr   )rU   )r#   r   r^   r   r=   rt   ru   rH   rv   r   r$   )rM   observerW   rk   rm   ra   rR   rR   rS   _infer_init   s    zInferenceNetwork._infer_initc             C   s
   t  d S )N)NotImplementedError)rM   rR   rR   rS   _init_layers   s    zInferenceNetwork._init_layersc             C   s
   t  d S )N)r|   )rM   batchrR   rR   rS   
_polymorph   s    zInferenceNetwork._polymorphc             C   s
   t  d S )N)r|   )rM   rl   Zprevious_variableproposal_min_train_iterationsrR   rR   rS   _infer_step   s    zInferenceNetwork._infer_stepc             C   s
   t  d S )N)r|   )rM   r~   rR   rR   rS   _loss   s    zInferenceNetwork._lossc                s   t  | _|  jd7  _i  t d< tj d< t|  d< d  d _d  d _| jd krfd  d _	n| j
  d _	d  d _| jd krd  d _n| j
  d _ fdd}t|d}|  |  d S )Nr   pyprob_versiontorch_versioninference_networkc                 sb   t jtt d} tj| d}t	 | t
jddd}|j|dd |  t|  d S )N)suffixpyprob_inference_networkzw:gzr   )compresslevel)arcname)tempfilemkdtempstruuiduuid4ospathjoinrH   savetarfileopenaddcloseshutilrmtree)tmp_dirZtmp_file_nametar)data	file_namerR   rS   thread_save   s    z+InferenceNetwork._save.<locals>.thread_save)target)r   rD   rE   rF   r   rH   copyr   r%   r'   
state_dictr*   r,   r   startr   )rM   r   r   trR   )r   r   rS   _save   s&    







zInferenceNetwork._savec          
   C   s  ytt | d}tjtt d}tj	|d}|
d| |  tjrVt|}ntj|dd d}t| W n2 tk
r } zt| tdW d d }~X Y nX |d tkrttd	|d td
dgd |d tjkrttd|d tjd
dgd |d }tjrl|jrN|jtjkrjttd|jtjd
dgd nttdtjd
dgd n$|jrttd|jd
dgd |jtjd t|dstd|_t|dstd|_t|dsg |_t|dsg |_ t|dsg |_!t|dsg |_"t|ds(d |_#t|ds:d |_$t|dsLd|_%t|ds^d|_&t|d spd|_'t|d!sd |_(|)|j* |+|j, |S )"Nzr:gz)r   r   c             S   s   | S )NrR   )storagelocrR   rR   rS   <lambda>   s    z(InferenceNetwork._load.<locals>.<lambda>)map_locationzCannot load inference network.r   zKWarning: different pyprob versions (loaded network: {}, current system: {})redbold)attrsr   zLWarning: different PyTorch versions (loaded network: {}, current system: {})r   z=Warning: loading CUDA (device {}) network to CUDA (device {})z0Warning: loading CPU network to CUDA (device {})z0Warning: loading CUDA (device {}) network to CPU)rI   r>   g        r?   r@   rA   rB   rC   r/   r3   r1   r   r2   r)   r+   )-r   r   r   r   r   r   r   r   r   r   extractr   r   _cuda_enabledrH   loadr   r   	Exceptionrc   RuntimeErrorr   r   rd   rG   rJ   rh   hasattrr=   r>   r?   r@   rA   rB   rC   r/   r3   r1   r2   r)   r+   _create_optimizerr'   _create_lr_schedulerr,   )r   r   r   Ztmp_filer   eretrR   rR   rS   _load   sj     ""zInferenceNetwork._loadc                s.   || _ dt|k| _t j||d|i d S )NcudarI   )rJ   r   rG   r   rh   )rM   rI   argskwargs)rQ   rR   rS   rh      s    zInferenceNetwork.to@   c       
      C   s   | j s*| j| j|dd |   d| _ d| _t||dddd d}tdt	|d	 d}x`t
|D ]T\}}|t	|7 }| |}t| |rf|d k	rfd
|}	tddd | |	 qfW td d S )Nr   )rj   TFc             S   s   t | S )N)r   )xrR   rR   rS   r   	  s    z7InferenceNetwork._pre_generate_layers.<locals>.<lambda>)
batch_sizeshufflenum_workers
collate_fnzLayer pre-generation...Tracesz!{}_00000000_pre_generated.networkzSaving to disk...  )endzLayer pre-generation complete)r    rn   r!   __getitem__r}   r   r   r   progress_bar_initr\   	enumerater   progress_bar_updaterd   rc   r   progress_bar_end)
rM   datasetr   save_file_name_prefix
dataloaderii_batchr~   layers_changedr   rR   rR   rS   _pre_generate_layers  s"    


z%InferenceNetwork._pre_generate_layersc             C   s$   x|   D ]}t|jd q
W dS )z) broadcast rank 0 parameter to all ranks r   N)
parametersdist	broadcastr   )rM   paramrR   rR   rS   _distributed_sync_parameters  s    z-InferenceNetwork._distributed_sync_parametersc       	      C   s   t dd |  D }d}yt|g W n   d}t| Y nX g }x\t|  D ]L\}}|jdk	r|||jj qZ|| rZt t	
|j|_||jj qZW |rt| nx|D ]}t| qW x|D ]}|t| }qW dS )z! all_reduce grads from all ranks c             S   s   g | ]}|j d k	rdndqS )Nr   r   )grad)ro   prR   rR   rS   rq      s    z;InferenceNetwork._distributed_sync_grad.<locals>.<listcomp>TFN)r   r=   r   r   
all_reducer   r   ru   r   rH   
zeros_liker4   )	rM   
world_sizeZttmapZpytorch_allreduce_supports_listglr   r   gZlirR   rR   rS   _distributed_sync_grad  s(    


z'InferenceNetwork._distributed_sync_gradc             C   sT   t t|| _t| j |  jt|  _| jt| j | j| j	 | jS )N)
r   r=   r4   r>   r   r   r@   ru   rA   r.   )rM   lossr   rR   rR   rS   _distributed_update_train_loss;  s    z/InferenceNetwork._distributed_update_train_lossc             C   sT   t t|| _t| j |  jt|  _| jt| j | j| j	 | jS )N)
r   r=   r4   r?   r   r   rB   ru   rC   r.   )rM   r   r   rR   rR   rS   _distributed_update_valid_lossC  s    z/InferenceNetwork._distributed_update_valid_lossc             C   s   | j d krd S | j tjtjgkr<tj|  | j| jd| _	n tj
|  | j| jd| jd| _	| j tjtjgkrzt| j	| _	|d k	r| j	| d S )N)lrweight_decayT)r   momentumnesterovr   )r&   r   ADAM	ADAM_LARCoptimAdamr   r1   r)   r%   SGDr(   SGD_LARCr   load_state_dict)rM   r   rR   rR   rS   r   K  s    
 z"InferenceNetwork._create_optimizerc                s   | j d krd S | j }| j| j| jfdd | jd krHd | _nV|tjkrptj	| j fddd| _n.|tj
krtj	| j fddd| _nd | _| jd k	r|d k	r| j| d S )Nc                s    d|    |   S )Nr   rR   )iterpower)iter_endlr_endlr_initrR   rS   _poly_decayb  s    z:InferenceNetwork._create_lr_scheduler.<locals>._poly_decayc                s    | dd S )Ng      ?)r   rR   )r   )r   r   rR   rS   r   h  s    z7InferenceNetwork._create_lr_scheduler.<locals>.<lambda>)	lr_lambdac                s    | dd S )Ng       @)r   rR   )r   )r   r   rR   rS   r   j  s    )r+   r/   r1   r2   r%   r*   r   POLY1lr_schedulerLambdaLRPOLY2r   )rM   r   learning_rate_scheduler_typerR   )r   r   r   r   rS   r   Y  s     



z%InferenceNetwork._create_lr_schedulerg    eAg-C6?gư>g?gh㈵>iX  i'  
   r   Fc       <      C   s	  | j s*| j| j|dd |   d| _ |d kr<d}d}n(tj|d t }t }|| _	|| _
t|tr|dkrt|t||dd|dd d	}qt|t|||ddd
|dd d	}nt||ddd d}|d k	rN|dkrt|t||dd|dd d	}n"t|t|||ddd
|dd d	}| jsNx t|D ]\}}| | q6W |dkr\t||d |dkr\ttdddgd ttd|ddgd ttd|ddgd ttd|| |ddgd ttd|t| |ddgd ttdt|ddgd ttdt|ddgd ttdt|jjddgd |   | j}t   }|}|}|d krt!d|d }| d }d} | j"d kr|| _"| j#d kr|| _#| j$d kr|| _$| j%d kr|
| _%| j&d kr|t| | _&| j'd kr|	| _'| j(d kr(|| _(d}!d}"d}#td d}$d}%d}&| j)d krXdn
d | j)}'|| }(|tj* })|dkr|d k	rt+|d!dd"}*|*,d# x|#s|!d7 }!xt|D ]\}}t   }+|dkr| j-| dkr| .  | jrd},n
| |},| j/d ks|,r.| 0  | 1  | j/2  | 3|\}-}.|-stttd$|.d%dgd |rd S q|.4  |dkr| 5| | j/6  t7|.}.|dkr| 8|.|}.| j)d kr|.| _)|.| _9d | j)}'|.| j:k r8|.| _:td |.d&dgd}/td | j:d&dgd}%|+}tt;dd&dgd}&n|.| j9krb|.| _9td |.d%dgd}/n`|.| j<k rtd |.d&}/n(|.| j<krtd |.d%}/n
d |.}/d | j:}%t;|+| }&|.| _<|  j-d7  _-|"|j=| 7 }"|  j>|j=| 7  _>||+|  | _| j?@|. | jA@| j> |j=| |+|  }0|d k	r|"| |krtd' d} tBC 4 x,t|D ] \}}| 3|\}1}2| |27 } qjW W d Q R X t7| t| } |dkr| D| |} | jE@|  | jF@| j> |"d }|dkr0|d k	r0|+|( |kr0|+}(d(|tG | j>}3td)d*d+ | H|3 |+}|"|krPtd,| d}#| j>| j(krrtd-| j( d}#| jId k	r| jI6| j> | j/jJd d. }4d |4}5|+|) tj*ks|#r<|+})t;| j}6d/d0|!}7d1d0| j>}8d2|0}9d3|6|7|8|'|%|/|&|5|9	}:t!t|:|$}$t|:K|$d*d+ tLjMN  |dkr|d k	rd };t|jtrj|jjO};|*,d4| j| j-| j>|.| |4|jPt|jQ|;|0
 |#rP qW qW |dkr|d k	r|*R  t  |dk	r|d k		rd(|tG | j>}3td)d*d+ | H|3 d S )5Nr   )rj   Tr   )backend)r   shuffle_batchesc             S   s   t | S )N)r   )r   rR   rR   rS   r     s    z+InferenceNetwork.optimize.<locals>.<lambda>)batch_samplerr   r   )r   num_bucketsr   shuffle_bucketsc             S   s   t | S )N)r   )r   rR   rR   rS   r     s    c             S   s   t | S )N)r   )r   rR   rR   rS   r     s    )r   r   r   c             S   s   t | S )N)r   )r   rR   rR   rS   r     s    c             S   s   t | S )N)r   )r   rR   rR   rS   r     s    Fz Distributed synchronous trainingyellowr   )r   zDistributed backend        : {}zDistributed world size     : {}zADistributed minibatch size : {} (global effective), {} (per rank)z3Distributed init.learn rate: {} (global), {} (base)zDistributed optimizer      : {}z!Distributed dataset size   : {:,}z!Distributed num. buckets   : {:,}d   i  ziTrain. time | Epoch| Trace     | Init. loss| Min. loss | Curr. loss| T.since min | Learn.rate| Traces/secr   z{:+.2e}w)mode	bufferingztime, iteration, trace, loss, valid_loss, learning_rate, mean_trace_length_controlled, sub_mini_batches, distributed_bucket_id, traces_per_second
z-Cannot compute loss, skipping batch. Loss: {}r   greenzComputing validation lossz{}_{}_traces_{}.networkzSaving to disk...  r   )r   z'
Stop condition reached. num_traces: {}z+
Stop condition reached. num_traces_end: {}r   z{:4}z{:,}z{:9}z{:,.1f}z+{} | {} | {} | {} | {} | {} | {} | {} | {} z'{}, {}, {}, {}, {}, {}, {}, {}, {}, {}
)Sr    rn   r!   r   r}   r   init_process_groupget_world_sizeget_rankrK   rL   
isinstancer   r   r   r	   r   r   r   r   init_distributed_printrc   r   rd   mathsqrtr   r\   r   _bucketstrainr-   timemaxr&   r(   r)   r+   r1   r2   r/   r3   _print_refresh_rater   writer0   r   r%   r   r   	zero_gradr   backwardr   stepr4   r   r6   r5   days_hours_mins_secs_strr7   rb   r.   r8   ru   r9   rH   no_gradr   r:   r;   get_time_stampr   r*   param_groupsljustsysstdoutflush_current_bucket_idmean_length_controlledsub_batchesr   )<rM   
num_tracesr   dataset_validnum_traces_endr   valid_everyoptimizer_typelearning_rate_initlearning_rate_endr   r   r   r   save_every_secdistributed_backend"distributed_params_sync_every_iterdistributed_num_bucketsdataloader_offline_num_workersstop_with_bad_losslog_file_nameZdistributed_world_sizeZdistributed_rankr   Zdataloader_validr   r~   Zprev_total_train_seconds
time_startZtime_loss_minZtime_last_batchZlast_validation_traceZ
valid_lossepochrp   stopZmax_print_line_lenZloss_min_strZtime_since_loss_min_strZloss_init_strZlast_auto_save_timeZ
last_printZlog_fileZ
time_batchr   successr   Zloss_strtraces_per_second_vr   Zlearning_rate_currentZlearning_rate_current_strZtotal_training_seconds_strZ	epoch_strZtotal_train_traces_strZtraces_per_second_strZ
print_line	bucket_idrR   rR   rS   optimizeq  sf   
 $

 "

 &"



















0zInferenceNetwork.optimize)N)N)NN)N)r   N)N)N)__name__
__module____qualname__r   rn   ry   r{   r}   r   r   r   r   staticmethodr   rh   r   r   r   r   r   r   r   r   r   r   NONEr2  __classcell__rR   rR   )rQ   rS   r      s&   5-
	
	
!B


r   )*rH   torch.nnr   torch.optimr   Ztorch.optim.lr_schedulerr   torch.distributeddistributedr   torch.utils.datar   r  r
  r   r   r   r   r   r   r  	threadingr   	termcolorr   r   r   r   r   r	   r
   r   r   Zoptimizer_larcr   r   r   r   r   r   Moduler   rR   rR   rR   rS   <module>   s(   $