@@ -1579,7 +1579,7 @@ def layer_norm(input,
     """
     **Layer Normalization**
 
-    Assume feature vectors exist on dimensions 
+    Assume feature vectors exist on dimensions
     :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
     along these dimensions for each feature vector :math:`a` with size
     :math:`H`, then normalize each feature vector using the corresponding
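A rough sketch of the statistics described in this hunk (NumPy only, not the operator itself; the 2-D shape and variable names are assumptions for illustration):

    import numpy as np

    x = np.random.rand(4, 8).astype("float32")   # hypothetical input
    begin_norm_axis = 1                          # normalize dims [begin_norm_axis, rank)

    # Flatten the normalized dimensions into one axis of size H.
    H = int(np.prod(x.shape[begin_norm_axis:]))
    a = x.reshape(x.shape[0], H)

    mu = a.mean(axis=1, keepdims=True)           # per-vector mean
    var = a.var(axis=1, keepdims=True)           # per-vector variance
    epsilon = 1e-5
    a_hat = (a - mu) / np.sqrt(var + epsilon)    # normalized feature vectors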
			
		
	
	
		
			
				
@@ -1600,13 +1600,13 @@ def layer_norm(input,
 
     Args:
         input(Variable): The input tensor variable.
-        scale(bool): Whether to learn the adaptive gain :math:`g` after 
+        scale(bool): Whether to learn the adaptive gain :math:`g` after
             normalization.
-        shift(bool): Whether to learn the adaptive bias :math:`b` after 
+        shift(bool): Whether to learn the adaptive bias :math:`b` after
             normalization.
-        begin_norm_axis(bool): The normalization will be performed along 
+        begin_norm_axis(bool): The normalization will be performed along
             dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
-        epsilon(float): The small value added to the variance to prevent 
+        epsilon(float): The small value added to the variance to prevent
             division by zero.
         param_attr(ParamAttr|None): The parameter attribute for the learnable
             gain :math:`g`.
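For context, a minimal usage sketch of the layer documented above (a sketch only; the `fluid.layers` module path and the `fluid.layers.data` placeholder are assumptions, not part of this diff):

    import paddle.fluid as fluid

    # Assumed placeholder: one [32, 32] feature map per sample.
    data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
    # Normalize each sample over dimensions [begin_norm_axis, rank(input)).
    out = fluid.layers.layer_norm(input=data, scale=True, shift=True,
                                  begin_norm_axis=1, epsilon=1e-5)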
			
		
	
	
		
			
				
@@ -2070,7 +2070,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             Tensor variable with a single element, otherwise must be in the
             range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
             the dimension to reduce is :math:`rank + dim`.
-        keep_dim (bool): Whether to reserve the reduced dimension in the
+        keep_dim (bool|False): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
         name(str|None): A name for this layer(optional). If set None, the layer
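A short usage sketch of the `dim` and `keep_dim` arguments described above (the `fluid.layers` module path and the placeholder input are assumptions):

    import paddle.fluid as fluid

    x = fluid.layers.data(name="x", shape=[3, 4], dtype="float32")
    total = fluid.layers.reduce_sum(x)                 # dim=None: reduce to a single element
    rows = fluid.layers.reduce_sum(x, dim=-1)          # dim < 0: reduces dimension rank + dim
    rows_k = fluid.layers.reduce_sum(x, dim=-1,
                                     keep_dim=True)    # reduced dimension kept with size 1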
			
		
	
	
		
			
				
@@ -3098,33 +3098,33 @@ def multiplex(inputs, index):
 def softmax_with_cross_entropy(logits, label, soft_label=False):
     """
     **Softmax With Cross Entropy Operator.**
-    
+
     Cross entropy loss with softmax is used as the output layer extensively. This
     operator computes the softmax normalized values for each row of the input
     tensor, after which cross-entropy loss is computed. This provides a more
     numerically stable gradient.
-    
+
     Because this operator performs a softmax on logits internally, it expects
     unscaled logits. This operator should not be used with the output of
     softmax operator since that would produce incorrect results.
-    
+
     When the attribute soft_label is set false, this operators expects mutually
     exclusive hard labels, each sample in a batch is in exactly one class with a
     probability of 1.0. Each sample in the batch will have a single label.
-    
+
     The equation is as follows:
-    
+
     1) Hard label (one-hot label, so every sample has exactly one class)
-    
+
     .. math::
 
         loss_j =  -\\text{logit}_{label_j} +
         \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logit}_i)\\right), j = 1,..., K
-    
+
     2) Soft label (each sample can have a distribution over all classes)
 
     .. math::
-    
+
        loss_j =  -\\sum_{i=0}^{K}\\text{label}_i
        \\left(\\text{logit}_i - \\log\\left(\\sum_{i=0}^{K}
        \\exp(\\text{logit}_i)\\right)\\right), j = 1,...,K
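A small NumPy check of the hard-label equation above (illustration only; the numbers are made up and it does not call the operator):

    import numpy as np

    logits = np.array([2.0, 1.0, 0.1])   # unscaled logits for one sample
    label = 0                            # hard label: the sample's class index

    # loss = -logit_label + log(sum_i exp(logit_i))
    loss = -logits[label] + np.log(np.exp(logits).sum())

    # Matches the plain softmax + cross-entropy form.
    probs = np.exp(logits) / np.exp(logits).sum()
    assert np.isclose(loss, -np.log(probs[label]))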
			
		
	
	
		
			
				
@@ -3169,7 +3169,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     The operator takes the first dimension of X and Y as batch size.
     For each instance, it computes the smooth l1 loss element by element first
     and then sums all the losses. So the shape of Out is [batch_size, 1].
-    
+
     Args:
         x (Variable): A tensor with rank at least 2. The input value of smooth
             l1 loss op with shape [batch_size, dim1, ..., dimN].
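A minimal usage sketch matching the shape contract above (the module path and placeholders are assumptions, not part of this diff):

    import paddle.fluid as fluid

    # Predictions and targets share the shape [batch_size, 10].
    x = fluid.layers.data(name="x", shape=[10], dtype="float32")
    y = fluid.layers.data(name="y", shape=[10], dtype="float32")
    loss = fluid.layers.smooth_l1(x=x, y=y)   # per-instance loss, shape [batch_size, 1]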
			
		
	
	
		
			
				