B
    -]                 @   s  d dl Z d dlmZ d dlmZ d dlm  mZ d dlmZ	 d dl
mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ ddlmZmZmZmZmZmZm Z  ddl!m"Z" dd	lm#Z#m$Z$m%Z%m&Z&m'Z' G d
d dej(Z)dS )    N)
DataLoader)Thread)colored   )BatchOfflineDatasetTraceBatchSamplerDistributedTraceBatchSamplerEmbeddingFeedForwardEmbeddingCNN2D5CEmbeddingCNN3D5C)LARC   )__version__util	OptimizerLearningRateSchedulerObserveEmbeddingc                   s   e Zd Zi df fdd	Zdd Zd4ddZd5d	d
Zdd Zdd Zd6ddZ	dd Z
dd Zedd Zd7 fdd	Zd8ddZdd Zdd Zd d! Zd"d# Zd9d$d%Zd:d&d'Zdd(ddejd)d*ejd+d,dd-dd.d/d0d1dfd2d3Z  ZS );InferenceNetwork c                sV  t    || _t | _d | _d| _d| _|| _	d | _
d | _i | _d | _d | _d | _d | _d | _d | _d | _d | _d| _d| _d | _d| _d | _d | _d | _td| _d | _td| _g | _ g | _!g | _"g | _#g | _$g | _%t&'d| _(t&'d| _)g | _*g | _+g | _,g | _-t&. | _/d| _0d| _1t23d| _4d | _5d | _d | _6d | _7d | _8|| _9d S )NFr   infg        cpu):super__init___modelnn
ModuleDict_layers_observe_embedding_layers_observe_embedding_final_layers_pre_generated_layers_initialized_observe_embeddings_observe_embedding_dim_infer_observe_infer_observe_embedding
_optimizer_optimizer_type_optimizer_state	_momentum_weight_decay_learning_rate_scheduler_learning_rate_scheduler_type_learning_rate_scheduler_state_total_train_seconds_total_train_traces_total_train_traces_end_total_train_iterations_learning_rate_init_learning_rate_end
_loss_initfloat	_loss_min	_loss_max_loss_previous_history_train_loss_history_train_loss_trace_history_valid_loss_history_valid_loss_trace_history_num_params_history_num_params_tracer   	to_tensor_distributed_train_loss_distributed_valid_loss_distributed_history_train_loss%_distributed_history_train_loss_trace_distributed_history_valid_loss%_distributed_history_valid_loss_traceget_time_str	_modified_updates_on_cudatorchdevice_deviceZ_learning_rateZ_batch_size_distributed_backend_distributed_world_sizeZ_network_type)selfmodelobserve_embeddingsnetwork_type)	__class__ \/global/project/projectdirs/dasrepo/etalumis/pyprob_saeid-dev/pyprob/nn/inference_network.pyr      sd    




zInferenceNetwork.__init__c             C   s  t |dkrtdd}x| D ]\}}|j| }d|kr`t|d }td|| n|j	 }td|| d|krt|d g}td|| ntd| td	g}d
|kr|d
 }	td||	 ntd| t
j}	|	t
jkrFd|kr$|d }
td||
 ntd| d}
t|||
d}nB|	t
jkr`t||d}n(|	t
jkrzt||d}ntd|	|jtjd || j|< |t|7 }q$W || _td| j t| j| jdd| _| jjtjd d S )Nr   zIAt least one observe embedding is needed to initialize inference network.reshapezObservable {}: reshape to {}.z5Observable {}: reshape not specified, using shape {}.dimz&Observable {}: using embedding dim {}.zBObservable {}: embedding dim not specified, using the default 256.   	embeddingz*Observable {}: using observe embedding {}.zNObservable {}: observe embedding not specified, using the default FEEDFORWARD.depthz(Observable {}: using embedding depth {}.zBObservable {}: embedding depth not specified, using the default 2.r   )input_shapeoutput_shape
num_layers)rZ   r[   zUnknown embedding: {})rJ   zObserve embedding dimension: {})len
ValueErroritemsnamed_variablesrI   Sizeprintformatvaluesizer   FEEDFORWARDr
   CNN2D5Cr   CNN3D5Cr   tor   rK   r   prodr"   r   )rN   rP   example_traceZobserve_embedding_total_dimnamerd   variablerZ   r[   rX   rY   layerrS   rS   rT   _init_layers_observe_embeddingO   sN    



z/InferenceNetwork._init_layers_observe_embeddingNc                sj   g }xH| j  D ]:\ }t fdd|D t|d}||| qW tj|dd}| |}|S )Nc                s   g | ]}t |j  jqS rS   )r   r>   r`   rd   ).0trace)rl   rS   rT   
<listcomp>   s    z3InferenceNetwork._embed_observe.<locals>.<listcomp>r   )rV   )	r   r_   rI   stackviewr]   appendcatr   )rN   tracesrX   rn   valuesrS   )rl   rT   _embed_observe   s    $
zInferenceNetwork._embed_observec             C   sd   || _ g }x:| j D ],\}}t|| dd}||| qW tj|dd}| 	|| _
d S )Nr   rs   )rV   )r#   r   r_   r   r>   ru   rv   rI   rw   r   r$   )rN   observerX   rl   rn   rd   rS   rS   rT   _infer_init   s    zInferenceNetwork._infer_initc             C   s
   t  d S )N)NotImplementedError)rN   rS   rS   rT   _init_layers   s    zInferenceNetwork._init_layersc             C   s
   t  d S )N)r}   )rN   batchrS   rS   rT   
_polymorph   s    zInferenceNetwork._polymorphc             C   s
   t  d S )N)r}   )rN   rm   Zprevious_variableproposal_min_train_iterationsrS   rS   rT   _infer_step   s    zInferenceNetwork._infer_stepc             C   s
   t  d S )N)r}   )rN   r   rS   rS   rT   _loss   s    zInferenceNetwork._lossc                s   t  | _|  jd7  _i  t d< tj d< t|  d< d  d _d  d _| jd krfd  d _	n| j
  d _	d  d _| jd krd  d _n| j
  d _ fdd}t|d}|  |  d S )Nr   pyprob_versiontorch_versioninference_networkc                 sb   t jtt d} tj| d}t	 | t
jddd}|j|dd |  t|  d S )N)suffixpyprob_inference_networkzw:gzr   )compresslevel)arcname)tempfilemkdtempstruuiduuid4ospathjoinrI   savetarfileopenaddcloseshutilrmtree)tmp_dirZtmp_file_nametar)data	file_namerS   rT   thread_save   s    z+InferenceNetwork._save.<locals>.thread_save)target)r   rE   rF   rG   r   rI   copyr   r%   r'   
state_dictr*   r,   r   startr   )rN   r   r   trS   )r   r   rT   _save   s&    







zInferenceNetwork._savec          
   C   s  ytt | d}tjtt d}tj	|d}|
d| |  tjrVt|}ntj|dd d}t| W n2 tk
r } zt| tdW d d }~X Y nX |d tkrttd	|d td
dgd |d tjkrttd|d tjd
dgd |d }tjrl|jrN|jtjkrjttd|jtjd
dgd nttdtjd
dgd n$|jrttd|jd
dgd |jtjd t|dstd|_t|dstd|_t|dsg |_t|dsg |_ t|dsg |_!t|dsg |_"t|ds(d |_#t|ds:d |_$t|dsLd |_%t|ds^d |_&t|dspd |_'t|d!sd |_(t|d"sd |_)t|d#sd |_*|+|j# |,|j$ |S )$Nzr:gz)r   r   c             S   s   | S )NrS   )storagelocrS   rS   rT   <lambda>       z(InferenceNetwork._load.<locals>.<lambda>)map_locationzCannot load inference network.r   zKWarning: different pyprob versions (loaded network: {}, current system: {})redbold)attrsr   zLWarning: different PyTorch versions (loaded network: {}, current system: {})r   z=Warning: loading CUDA (device {}) network to CUDA (device {})z0Warning: loading CPU network to CUDA (device {})z0Warning: loading CUDA (device {}) network to CPU)rJ   r?   g        r@   rA   rB   rC   rD   r'   r,   r/   r3   r1   r   r2   r)   r+   )-r   r   r   r   r   r   r   r   r   r   extractr   r   _cuda_enabledrI   loadr   r   	Exceptionrb   RuntimeErrorr   r   rc   rH   rK   ri   hasattrr>   r?   r@   rA   rB   rC   rD   r'   r,   r/   r3   r1   r2   r)   r+   _create_optimizer_create_lr_scheduler)r   r   r   Ztmp_filer   eretrS   rS   rT   _load   sr     ""zInferenceNetwork._loadc                s.   || _ dt|k| _t j||d|i d S )NcudarJ   )rK   r   rH   r   ri   )rN   rJ   argskwargs)rR   rS   rT   ri     s    zInferenceNetwork.to@   c       
      C   s   | j s*| j| j|dd |   d| _ d| _t||dddd d}tdt	|d	 d}x`t
|D ]T\}}|t	|7 }| |}t| |rf|d k	rfd
|}	tddd | |	 qfW td d S )Nr   )rk   TFc             S   s   t | S )N)r   )xrS   rS   rT   r     r   z7InferenceNetwork._pre_generate_layers.<locals>.<lambda>)
batch_sizeshufflenum_workers
collate_fnzLayer pre-generation...Tracesz!{}_00000000_pre_generated.networkzSaving to disk...  )endzLayer pre-generation complete)r    ro   r!   __getitem__r~   r   r   r   progress_bar_initr]   	enumerater   progress_bar_updaterc   rb   r   progress_bar_end)
rN   datasetr   save_file_name_prefix
dataloaderii_batchr   layers_changedr   rS   rS   rT   _pre_generate_layers  s"    


z%InferenceNetwork._pre_generate_layersc             C   s$   x|   D ]}t|jd q
W dS )z) broadcast rank 0 parameter to all ranks r   N)
parametersdist	broadcastr   )rN   paramrS   rS   rT   _distributed_sync_parameters  s    z-InferenceNetwork._distributed_sync_parametersc       	      C   s   t dd |  D }d}yt|g W n   d}t| Y nX g }x\t|  D ]L\}}|jdk	r|||jj qZ|| rZt t	
|j|_||jj qZW |rt| nx|D ]}t| qW x|D ]}|t| }qW dS )z! all_reduce grads from all ranks c             S   s   g | ]}|j d k	rdndqS )Nr   r   )grad)rp   prS   rS   rT   rr   )  s    z;InferenceNetwork._distributed_sync_grad.<locals>.<listcomp>TFN)r   r>   r   r   
all_reducer   r   rv   r   rI   
zeros_liker4   )	rN   
world_sizeZttmapZpytorch_allreduce_supports_listglr   r   gZlirS   rS   rT   _distributed_sync_grad%  s(    


z'InferenceNetwork._distributed_sync_gradc             C   sT   t t|| _t| j |  jt|  _| jt| j | j| j	 | jS )N)
r   r>   r4   r?   r   r   rA   rv   rB   r.   )rN   lossr   rS   rS   rT   _distributed_update_train_lossD  s    z/InferenceNetwork._distributed_update_train_lossc             C   sT   t t|| _t| j |  jt|  _| jt| j | j| j	 | jS )N)
r   r>   r4   r@   r   r   rC   rv   rD   r.   )rN   r   r   rS   rS   rT   _distributed_update_valid_lossL  s    z/InferenceNetwork._distributed_update_valid_lossc             C   s   | j d krd S | j tjtjgkr<tj|  | j| jd| _	n tj
|  | j| jd| jd| _	| j tjtjgkrzt| j	| _	|d k	r| j	| d S )N)lrweight_decayT)r   momentumnesterovr   )r&   r   ADAM	ADAM_LARCoptimAdamr   r1   r)   r%   SGDr(   SGD_LARCr   load_state_dict)rN   r   rS   rS   rT   r   T  s    
 z"InferenceNetwork._create_optimizerc                s   | j d krd S | j }| j| j| jfdd | jd krHd | _nV|tjkrptj	| j fddd| _n.|tj
krtj	| j fddd| _nd | _| jd k	r|d k	r| j| d S )Nc                s    d|    |   S )Nr   rS   )iterpower)iter_endlr_endlr_initrS   rT   _poly_decayk  s    z:InferenceNetwork._create_lr_scheduler.<locals>._poly_decayc                s    | dd S )Ng      ?)r   rS   )r   )r   r   rS   rT   r   q  r   z7InferenceNetwork._create_lr_scheduler.<locals>.<lambda>)	lr_lambdac                s    | dd S )Ng       @)r   rS   )r   )r   r   rS   rT   r   s  r   )r+   r/   r1   r2   r%   r*   r   POLY1lr_schedulerLambdaLRPOLY2r   )rN   r   learning_rate_scheduler_typerS   )r   r   r   r   rT   r   b  s     



z%InferenceNetwork._create_lr_schedulerg    eAg-C6?gư>g?gh㈵>iX  i'  
   r   Fc       <      C   sZ	  | j s*| j| j|dd |   d| _ |d kr<d}d}n(tj|d t }t }|| _	|| _
t|tr|dkrt|t||dd|dd d	}qt|t|||ddd
|dd d	}nt||ddd d}|d k	rN|dkrt|t||dd|dd d	}n"t|t|||ddd
|dd d	}| jsNx t|D ]\}}| | q6W |dkr\t||d |dkr\ttdddgd ttd|ddgd ttd|ddgd ttd|| |ddgd ttd|t| |ddgd ttdt|ddgd ttdt|ddgd ttdt|jjddgd |   | j}t   }|}|}|d krt!d|d }| d }d} | j"d kr|| _"| j#d kr|| _#| j$d kr|| _$| j%d kr|
| _%| j&d kr|t| | _&| j'd kr|	| _'| j(d kr(|| _(d}!d}"d}#td d}$d}%d}&| j)d krXdn
d | j)}'|d k	rv|| }(|tj* })|dkr|d k	rt+|d!dd"}*|*,d# xF|#s|!d7 }!x.t|D ] \}}t   }+|dkr| j-| dkr| .  | jrd},n
| |},| j/d ks(|,r8| 0  | 1  | j/2  | 3|\}-}.|-s~ttd$|.d%dgd |rd S q|.4  |dkr| 5| | j/6  t7|.}.|dkr| 8|.|}.| j)d kr|.| _)|.| _9d | j)}'|.| j:k rB|.| _:td |.d&dgd}/td | j:d&dgd}%|+}tt;dd&dgd}&n|.| j9krl|.| _9td |.d%dgd}/n`|.| j<k rtd |.d&}/n(|.| j<krtd |.d%}/n
d |.}/d | j:}%t;|+| }&|.| _<|  j-d7  _-|"|j=| 7 }"|  j>|j=| 7  _>||+|  | _| j?@|. | jA@| j> |j=| |+|  }0|d k	r|"| |krtd' d} tBC 4 x,t|D ] \}}| 3|\}1}2| |27 } qtW W d Q R X t7| t||  } |dkr| D| |} | jE@|  | jF@| j> |"d }|dkrH|d k	rH|d k	rH|+|( |krH|+}(d(|tG | j>}3td)d*d+ | H|3 |+}|"|krhtd,| d}#| j>| j(krttd-| j(d%dgd | jId k	rttd.| j(d%dgd | jId k	r| jI6| j> | j/jJd d/ }4d |4}5|+|) tj*ks|#r|+})t;| j}6d0d1|!}7d2d1| j>}8d3|0}9d4|6|7|8|'|%|/|&|5|9	}:t!t|:|$}$t|:K|$d*d+ tLjMN  |dkr|d k	rd };t|jtr|jjO};|*,d5| j| j-| j>|.| |4|jPt|jQ|;|0
 |#rP qW qW |dk	r|d k		r|*R  t  |dk	rV|d k		rVd(|tG | j>}3td)d*d+ | H|3 d S )6Nr   )rk   Tr   )backend)r   shuffle_batchesc             S   s   t | S )N)r   )r   rS   rS   rT   r     r   z+InferenceNetwork.optimize.<locals>.<lambda>)batch_samplerr   r   )r   num_bucketsr   shuffle_bucketsc             S   s   t | S )N)r   )r   rS   rS   rT   r     r   c             S   s   t | S )N)r   )r   rS   rS   rT   r     r   )r   r   r   c             S   s   t | S )N)r   )r   rS   rS   rT   r     r   c             S   s   t | S )N)r   )r   rS   rS   rT   r     r   Fz Distributed synchronous trainingyellowr   )r   zDistributed backend        : {}zDistributed world size     : {}zADistributed minibatch size : {} (global effective), {} (per rank)z3Distributed init.learn rate: {} (global), {} (base)zDistributed optimizer      : {}z!Distributed dataset size   : {:,}z!Distributed num. buckets   : {:,}d   i  ziTrain. time | Epoch| Trace     | Init. loss| Min. loss | Curr. loss| T.since min | Learn.rate| Traces/secr   z{:+.2e}w)mode	bufferingztime, iteration, trace, loss, valid_loss, learning_rate, mean_trace_length_controlled, sub_mini_batches, distributed_bucket_id, traces_per_second
z-Cannot compute loss, skipping batch. Loss: {}r   greenz
Computing validation lossz{}_{}_traces_{}.networkzSaving to disk...  r   )r   z'
Stop condition reached. num_traces: {}zI
Stop condition reached. num_traces_end set during network generation: {}zkWarning: continuing training with learning rate scheduler beyond num_traces_end, make sure this is intendedr   z{:4}z{:,}z{:9}z{:,.1f}z+{} | {} | {} | {} | {} | {} | {} | {} | {} z'{}, {}, {}, {}, {}, {}, {}, {}, {}, {}
)Sr    ro   r!   r   r~   r   init_process_groupget_world_sizeget_rankrL   rM   
isinstancer   r   r   r	   r   r   r   r   init_distributed_printrb   r   rc   mathsqrtr   r]   r   _bucketstrainr-   timemaxr&   r(   r)   r+   r1   r2   r/   r3   _print_refresh_rater   writer0   r   r%   r   r   	zero_gradr   backwardr   stepr4   r   r6   r5   days_hours_mins_secs_strr7   re   r.   r8   rv   r9   rI   no_gradr   r:   r;   get_time_stampr   r*   param_groupsljustsysstdoutflush_current_bucket_idmean_length_controlledsub_batchesr   )<rN   
num_tracesr   dataset_validnum_traces_endr   valid_everyoptimizer_typelearning_rate_initlearning_rate_endr   r   r   r   save_every_secdistributed_backend"distributed_params_sync_every_iterdistributed_num_bucketsdataloader_offline_num_workersstop_with_bad_losslog_file_nameZdistributed_world_sizeZdistributed_rankr   Zdataloader_validr   r   Zprev_total_train_seconds
time_startZtime_loss_minZtime_last_batchZlast_validation_traceZ
valid_lossepochrq   stopZmax_print_line_lenZloss_min_strZtime_since_loss_min_strZloss_init_strZlast_auto_save_timeZ
last_printZlog_fileZ
time_batchr   successr   Zloss_strtraces_per_second_vr   Zlearning_rate_currentZlearning_rate_current_strZtotal_training_seconds_strZ	epoch_strZtotal_train_traces_strZtraces_per_second_strZ
print_line	bucket_idrS   rS   rT   optimizez  sj   
 $

 "

 &"




















0zInferenceNetwork.optimize)N)N)NN)N)r   N)N)N)__name__
__module____qualname__r   ro   rz   r|   r~   r   r   r   r   staticmethodr   ri   r   r   r   r   r   r   r   r   r   r   NONEr4  __classcell__rS   rS   )rR   rT   r      s&   52
	
	
!F


r   )*rI   torch.nnr   torch.optimr   Ztorch.optim.lr_schedulerr   torch.distributeddistributedr   torch.utils.datar   r  r  r   r   r   r   r   r   r  	threadingr   	termcolorr   r   r   r   r   r	   r
   r   r   Zoptimizer_larcr   r   r   r   r   r   Moduler   rS   rS   rS   rT   <module>   s(   $