# Module caffe2.python.regularizer
from __future__ import absolute_import, division, print_function, unicode_literals

from caffe2.python import core, utils
import numpy as np


class RegularizationBy(object):
    AFTER_OPTIMIZER = "after_optimizer"
    ON_LOSS = "on_loss"
   d| _ d S )Ng&.>)kEpsilon)selfr   r   r   __init__   s    zRegularizer.__init__Nc             C   sv   t |tjsttt}|| ks>td| j	|| d| }t
| |sbtd| j	|t| |||||S )Nz>Regularizer of type {} is called with invalid by={}, not in {}Z_run_z5Regularizer of type {} does not implement function {})
isinstancer   ZBlobReferenceAssertionErrorr   ZEnumClassKeyValsr   valuesformat	__class__hasattrgetattr)r   netparam_init_netparamgradZbyZby_enumZrun_funcr   r   r   __call__   s    

zRegularizer.__call__c             C   s   d S )Nr   )r   r   r   r   r   r   r   r   _run_on_loss'   s    zRegularizer._run_on_lossc             C   s   d S )Nr   )r   r   r   r   r   r   r   r   _run_after_optimizer*   s    z Regularizer._run_after_optimizerFc	       
      C   st   |d k	r|s|r|| j  n|}|d k	r8|s.|r8|| j  n|}t|tjrV||j|jgn|g}	|j|	|g||d d S )N)minmax)r   r   r   GradientSliceindicesr   ZEnsureClipped)
r   r   r   r   r    r!   
open_range	left_open
right_openZinput_blobsr   r   r   _ensure_clipped-   s    zRegularizer._ensure_clipped)NN)N)NNNFFF)r	   r
   r   r   r   r   r   r'   r   r   r   r   r      s   	

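
# Usage sketch (hypothetical blob/net names; assumes a model built with
# caffe2's model_helper or brew). A concrete subclass is applied through
# __call__, which dispatches to _run_on_loss or _run_after_optimizer based
# on `by`:
#
#     reg = L1Norm(reg_lambda=1e-4)
#     penalty_blob = reg(
#         model.net, model.param_init_net, fc_weight,
#         by=RegularizationBy.ON_LOSS,
#     )
#     # penalty_blob holds the scaled penalty; add it to the loss.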
     r   c                   s&   e Zd Z fddZdddZ  ZS )L1Normc                s(   t t|   |dkstd|| _d S )Nr   z6factor ahead of regularization should be 0 or positive)superr(   r   r   
reg_lambda)r   r*   )r   r   r   r   K   s    zL1Norm.__init__Nc             C   s<   | |d }|j|g|gdd |j|g|g| jd |S )NZ_l1_regularization   )p)scale)NextScopedBlobLpNormScaler*   )r   r   r   r   r   output_blobr   r   r   r   Q   s    zL1Norm._run_on_loss)N)r	   r
   r   r   r   __classcell__r   r   )r   r   r(   J   s   r(   c                   s&   e Zd Z fddZdddZ  ZS )L2Normc                s(   t t|   |dkstd|| _d S )Nr   z6factor ahead of regularization should be 0 or positive)r)   r3   r   r   r*   )r   r*   )r   r   r   r   Y   s    zL2Norm.__init__Nc             C   s<   | |d }|j|g|gdd |j|g|g| jd |S )NZ_l2_regularization   )r,   )r-   )r.   r/   r0   r*   )r   r   r   r   r   r1   r   r   r   r   _   s    zL2Norm._run_on_loss)N)r	   r
   r   r   r   r2   r   r   )r   r   r3   X   s   r3   c                   s&   e Zd Zd fdd	Zdd Z  ZS )MaxNorm      ?c                s   t t|   || _d S )N)r)   r5   r   norm)r   r7   )r   r   r   r   g   s    zMaxNorm.__init__c             C   sL   | j dkstdt|tjr@|j||j|jg|gd| j d ntdd S )Nr   znorm should be bigger than 0.T)use_max_normr7   z-MaxNorm is not supported for dense parameters)	r7   r   r   r   r"   SparseNormalizer#   r   NotImplementedError)r   r   r   r   r   r   r   r   r   k   s    zMaxNorm._run_after_optimizer)r6   )r	   r
   r   r   r   r2   r   r   )r   r   r5   f   s   r5   c                   s&   e Zd Zd fdd	Zdd Z  ZS )ConstantNorm      ?c                s   t t|   || _d S )N)r)   r;   r   r7   )r   r7   )r   r   r   r   y   s    zConstantNorm.__init__c             C   sL   | j dkstdt|tjr@|j||j|jg|gd| j d ntdd S )Nr   znorm should be bigger than 0.F)r8   r7   z2ConstantNorm is not supported for dense parameters)	r7   r   r   r   r"   r9   r#   r   r:   )r   r   r   r   r   r   r   r   r   }   s    z!ConstantNorm._run_after_optimizer)r<   )r	   r
   r   r   r   r2   r   r   )r   r   r;   x   s   r;   c                   s4   e Zd ZdZd
 fdd	ZdddZdd	 Z  ZS )
LogBarrierzr
    Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
    35(67-68), 7. Chapter 19
    invNc                s>   t t|   |dkstd|| _|| _|p6ddd| _dS )z
        discount is a positive weight that is decreasing, and here it is implemented
        similar to the learning rate. It is specified by a learning rate policy and
        corresponding options
        r   z6factor ahead of regularization should be 0 or positiveg      ?)gammapowerN)r)   r=   r   r   r*   discount_policydiscount_options)r   r*   rA   rB   )r   r   r   r      s
    zLogBarrier.__init__c             C   s   t ||}||d }|j|g|gf| j | jd| j ||d }|j|g|g| jd ||d }|	|g|g ||d }	|
|g|	g ||d }
|j|	|g|
gdd	 |
S )
NZ_log_barrier_discount)Zbase_lrpolicyZ_non_neg)r    _logZ_log_sumZ_log_barrierr+   )	broadcast)r   ZBuildUniqueMutexIterr.   ZLearningRater*   rA   rB   ZClipr   LogZSumElementsMul)r   r   r   r   r   	iterationZdiscountZparam_non_negZ	param_logZparam_log_sumr1   r   r   r   r      s"    
zLogBarrier._run_on_lossc             C   s   | j |||ddd d S )Nr   T)r    r$   )r'   )r   r   r   r   r   r   r   r   r      s    zLogBarrier._run_after_optimizer)r>   N)N)r	   r
   r   __doc__r   r   r   r2   r   r   )r   r   r=      s   

class BoundedGradientProjection(Regularizer):
    """
    Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
    35(67-68), 7. Chapter 16
    """

    def __init__(
        self, lb=None, ub=None, left_open=False, right_open=False, epsilon=None
    ):
        super(BoundedGradientProjection, self).__init__()
        lb = float(lb) if lb is not None else None
        ub = float(ub) if ub is not None else None
        epsilon = float(epsilon) if epsilon is not None else self.kEpsilon
        assert epsilon > 0, "Bounded Gradient Projection with invalid eps={eps}".format(
            eps=epsilon
        )
        assert (
            lb is None
            or ub is None
            or (
                lb + (epsilon if left_open else 0.0)
                <= ub - (epsilon if right_open else 0.0)
            )
        ), (
            "Bounded Gradient Projection with invalid "
            "{lp}ub={ub}, lb={lb}{rp}, eps={eps}".format(
                lb=lb,
                ub=ub,
                lp="(" if left_open else "[",
                rp=")" if right_open else "]",
                eps=epsilon,
            )
        )
        self.left_open = left_open
        self.right_open = right_open
        self.kEpsilon = epsilon
        self.lb = lb
        self.ub = ub

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        self._ensure_clipped(
            net,
            param,
            grad,
            min=self.lb,
            max=self.ub,
            left_open=self.left_open,
            right_open=self.right_open,
        )

class GroupL1Norm(Regularizer):
    """
    Scardapane, Simone, et al. "Group sparse regularization for deep neural networks."
    Neurocomputing 241 (2017): 81-89.

    This regularizer computes the l1 norm of a weight matrix based on groups.
    There are essentially three stages in the computation:
    1. Compute the l2 norm on all the members of each group
    2. Scale each l2 norm by the size of each group
    3. Compute the l1 norm of the scaled l2 norms
    """

    def __init__(self, reg_lambda, groups, stabilizing_val=0):
        """
        Args:
            reg_lambda: The weight of the regularization term.
            groups: A list of integers describing the size of each group.
                The length of the list is the number of groups.

        Optional Args:
            stabilizing_val: The computation of GroupL1Norm involves the Sqrt
                operator. When values are small, its gradient can be numerically
                unstable, causing gradient explosion. Adding this term stabilizes
                the gradient calculation. The recommended value is 1e-8, but it
                depends on the specific scenario. If the implementation of the
                gradient operator of Sqrt has taken stability into consideration,
                this term won't be necessary.
        """
        super(GroupL1Norm, self).__init__()
        assert reg_lambda >= 0, "regularization weight should be 0 or positive"
        assert isinstance(groups, list), "groups needs to be a list"
        self.reg_lambda = reg_lambda
        self.groups = groups
        self.stabilizing_val = stabilizing_val

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """
        Args:
            param: The input blob to regularize. It should be a weight matrix
                blob with shape (output_dim, input_dim). input_dim should be
                equal to the sum of self.groups.

        Returns:
            group_l1_norm: The output blob after applying regularization.

        These are the steps of computation:
            1. square all elements
            2. sum by row
            3. lengthssum by group
            4. square_root all elements
            5. normalize each group based on group size
            6. compute l1 norm of each group
            7. scale the result with the regularization lambda
        """
        squared = net.Sqr(param)
        reduced_sum = net.ReduceSum(squared, axes=[0], keepdims=0)
        lengths_sum = net.LengthsSum(
            [
                reduced_sum,
                net.GivenTensorIntFill(
                    [], 1, shape=[len(self.groups)], values=self.groups
                ),
            ]
        )

        if self.stabilizing_val:
            net.Add(
                [lengths_sum, net.ConstantFill([], 1, value=self.stabilizing_val)],
                [lengths_sum],
                broadcast=1,
            )

        sqrt = net.Sqrt(lengths_sum)

        # Steps 5 and 7 are fused into a single Mul: each group l2 norm is
        # multiplied by sqrt(group_size) * reg_lambda.
        l2_scaled = net.Mul(
            [
                sqrt,
                net.GivenTensorFill(
                    [],
                    shape=[len(self.groups)],
                    values=np.sqrt(self.groups) * self.reg_lambda,
                ),
            ],
            ["normalized_l2_norm_scaled"],
        )

        group_l1_norm = net.LpNorm(l2_scaled, ["group_l1_norm"], p=1)

        return group_l1_norm
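
# Worked numeric sketch of GroupL1Norm (plain NumPy, for illustration only;
# the operator pipeline above computes the same quantity):
#
#     import numpy as np
#     groups = [2, 3]                                  # input_dim = 5
#     W = np.arange(10, dtype=float).reshape(2, 5)     # (output_dim, input_dim)
#     col_sq = (W ** 2).sum(axis=0)                    # steps 1-2: squared column sums
#     group_sq = [col_sq[:2].sum(), col_sq[2:].sum()]  # step 3: sum within each group
#     group_l2 = np.sqrt(group_sq)                     # step 4: per-group l2 norms
#     reg_lambda = 0.1
#     penalty = reg_lambda * np.sum(np.sqrt(groups) * group_l2)   # steps 5-7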