Make the normalization operator more general and fix bug in l2_normalize. (#11348)

* Add normalization operator.
1. Refine the raw norm_op and let it more general to support to normalize Tensor along any axis.
2. There is a bug in l2_normalize API, which lacks sqrt after `reduce_sum`.
3. Use norm_op to refine the l2_normalize API.
4. Fix bug in test_normalization_wrapper.py.
wangkuiyi-patch-1
qingqing01 7 years ago committed by GitHub
parent f15504e57a
commit 19fd071785
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -16,40 +16,34 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename AttrType>
class NormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"X",
"(Tensor) The input tensor of norm operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of channels, H and W is the height and width of feature.");
AddInput("Scale",
"(Tensor) The input tensor of norm operator. "
"The format of input tensor is C * 1.");
AddAttr<AttrType>("epsilon",
"(float, default 1e-10) Constant "
"for numerical stability.")
AddInput("X", "(Tensor) A tensor of rank >= axis.");
AddAttr<int>("axis",
"The axis on which to apply normalization. If axis < 0, "
"the dimension to normalization is rank(X) + axis. -1 is "
"the last dimension.");
AddAttr<float>("epsilon",
"(float, default 1e-10) The epsilon value is used "
"to avoid division by zero.")
.SetDefault(1.0e-10f);
AddOutput("Out",
"(Tensor) The output tensor of norm operator."
"N * M."
"M = C * H * W");
AddOutput("Norm",
"(Tensor) A tensor saved the `sqrt(sum(x) + epsion)` will "
"be used in backward kernel.")
.AsIntermediate();
AddOutput("Out", "(Tensor) A tensor of the same shape as X.");
AddComment(R"DOC(
"Input shape: $(N, C, H, W)$
Scale shape: $(C, 1)$
Output shape: $(N, C, H, W)$
Where
forward
$$
[\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot \cdot \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}]
$$
backward
$$
\frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}}
$$
)DOC");
Given a tensor, apply 2-normalization along the provided axis.
$$
y = \frac{x}{ \sqrt{\sum {x^2} + epsion }}
$$
where, $\sum {x^2}$ is calculated along the `axis` dimension.
)DOC");
}
};
@ -58,15 +52,15 @@ class NormOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of NormOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Scale"),
"Input(Scale) of NormOp"
"should not be null.");
"Input(X) of NormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of NormOp should not be null.");
auto in_x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", in_x_dims);
auto xdim = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", xdim);
int axis = ctx->Attrs().Get<int>("axis");
if (axis < 0) axis = xdim.size() + axis;
xdim[axis] = 1;
ctx->SetOutputDim("Norm", xdim);
}
};
@ -84,12 +78,12 @@ class NormOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker<float>,
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(norm_grad, ops::NormOpGrad);
REGISTER_OP_CPU_KERNEL(
norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
REGISTER_OP_CPU_KERNEL(
norm_grad, ops::NormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::NormGradKernel<paddle::platform::CPUDeviceContext, double, float>);
REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel<CPU, float>,
ops::NormKernel<CPU, double>);
REGISTER_OP_CPU_KERNEL(norm_grad, ops::NormGradKernel<CPU, float>,
ops::NormGradKernel<CPU, double>);

@ -16,9 +16,9 @@ limitations under the License. */
#include "paddle/fluid/operators/norm_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
norm, ops::NormKernel<paddle::platform::CUDADeviceContext, float>,
ops::NormKernel<paddle::platform::CUDADeviceContext, double, float>);
REGISTER_OP_CUDA_KERNEL(
norm_grad, ops::NormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::NormGradKernel<paddle::platform::CUDADeviceContext, double, float>);
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(norm, ops::NormKernel<CUDA, float>,
ops::NormKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradKernel<CUDA, float>,
ops::NormGradKernel<CUDA, double>);

File diff suppressed because it is too large Load Diff

@ -2467,19 +2467,21 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
The l2 normalize layer normalizes `x` along dimension `axis` using an L2
norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes
output = x / sqrt(max(sum(x**2), epsilon))
.. math::
y = \frac{x}{ \sqrt{\sum {x^2} + epsion }}
For `x` with more dimensions, this layer independently normalizes each 1-D
slice along dimension `axis`.
Args:
x(Variable|list): The input tensor to l2_normalize layer.
axis(int): Dimension along which to normalize the input.
epsilon(float): A lower bound value for `x`'s l2 norm. sqrt(epsilon) will
be used as the divisor if the l2 norm of `x` is less than
sqrt(epsilon).
axis(int): The axis on which to apply normalization. If `axis < 0`,
the dimension to normalization is rank(X) + axis. -1 is the
last dimension.
epsilon(float): The epsilon value is used to avoid division by zero,
the defalut value is 1e-10.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
will be named automatically.
Returns:
@ -2498,46 +2500,17 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
axis = 0
helper = LayerHelper("l2_normalize", **locals())
square = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(type="square", inputs={"X": x}, outputs={"Out": square})
reduced_sum = helper.create_tmp_variable(dtype=x.dtype)
out = helper.create_tmp_variable(dtype=x.dtype)
norm = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(
type="reduce_sum",
inputs={"X": square},
outputs={"Out": reduced_sum},
type="norm",
inputs={"X": x},
outputs={"Out": out,
"Norm": norm},
attrs={
"dim": [1] if axis is None else [axis],
"keep_dim": True,
"reduce_all": False
"axis": 1 if axis is None else axis,
"epsilon": epsilon,
})
# TODO(caoying) A lower bound value epsilon for the norm is needed to
# imporve the numeric stability of reciprocal. This requires a maximum_op.
rsquare = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(
type="reciprocal", inputs={"X": reduced_sum}, outputs={"Out": rsquare})
# TODO(caoying) the current elementwise_mul operator does not support a
# general broadcast rule which broadcasts input(Y) to have the same
# dimension with Input(X) starting from a specified dimension. So this
# exanpsion is requred. Once a general broadcast rule is spported, this
# expanding canbe removed.
rsquare_expanded = helper.create_tmp_variable(dtype=x.dtype)
expand_times = [1] * len(x.shape)
expand_times[axis] = int(x.shape[axis])
helper.append_op(
type="expand",
inputs={"X": rsquare},
outputs={"Out": rsquare_expanded},
attrs={"expand_times": expand_times})
out = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(
type="elementwise_mul",
inputs={"X": x,
"Y": rsquare_expanded},
outputs={"Out": out})
return out

@ -387,6 +387,12 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output)
print(str(program))
def test_l2_normalize(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[8, 7, 10], dtype="float32")
output = layers.l2_normalize(x, axis=1)
def test_maxout(self):
program = Program()
with program_guard(program):

@ -17,44 +17,23 @@ import numpy as np
from op_test import OpTest
def norm(input, scale, epsilon):
s0, s1, s2, s3 = input.shape
x_square = input * input
for i in xrange(s0):
input_batch = input[i:i + 1, :, :, :]
input_batch = input_batch.reshape(s1, s2 * s3)
x_square_batch = x_square[i:i + 1, :, :, :]
x_square_batch = x_square_batch.reshape(s1, s2 * s3)
square_colsum = x_square_batch.sum(axis=0) + epsilon
tmp = pow(square_colsum, 0.5)
tmp = np.reciprocal(tmp)
tmp_tile = np.tile(tmp, s1)
tmp_tile = tmp_tile.reshape(s1, s2 * s3)
scale_tile = np.tile(scale, (1, s2 * s3))
scale_tile = scale_tile.reshape(s1, s2 * s3)
out_batch = input_batch * tmp_tile * scale_tile
out_batch = out_batch.reshape(1, s1, s2, s3)
if i == 0:
out = out_batch
else:
out = np.concatenate((out, out_batch), 0)
out.reshape(s0, s1, s2, s3)
return out
def l2_norm(x, axis, epsilon):
x2 = x**2
s = np.sum(x2, axis=axis, keepdims=True)
r = np.sqrt(s + epsilon)
y = x / np.broadcast_to(r, x.shape)
return y, r
class TestNormOp(OpTest):
def setUp(self):
self.op_type = "norm"
self.init_test_case()
input = np.random.random(self.shape).astype("float32")
scale = np.array([10, 10, 10])
self.inputs = {
'X': input.astype('float32'),
'Scale': scale.astype('float32')
}
self.attrs = {'epsilon': self.epsilon}
output = norm(input, scale, self.epsilon)
self.outputs = {'Out': output.astype('float32')}
x = np.random.random(self.shape).astype("float64")
y, norm = l2_norm(x, self.axis, self.epsilon)
self.inputs = {'X': x}
self.attrs = {'epsilon': self.epsilon, 'axis': self.axis}
self.outputs = {'Out': y, 'Norm': norm}
def test_check_output(self):
self.check_output()
@ -63,8 +42,23 @@ class TestNormOp(OpTest):
self.check_grad(['X'], 'Out')
def init_test_case(self):
self.shape = [2, 3, 2, 2]
self.epsilon = 1e-6
self.shape = [2, 3, 4, 4]
self.axis = 1
self.epsilon = 1e-8
class TestNormOp2(TestNormOp):
def init_test_case(self):
self.shape = [5, 3, 9, 7]
self.axis = 0
self.epsilon = 1e-8
class TestNormOp3(TestNormOp):
def init_test_case(self):
self.shape = [5, 3, 2, 7]
self.axis = -1
self.epsilon = 1e-8
if __name__ == '__main__':

@ -70,8 +70,9 @@ class TestNormalization(unittest.TestCase):
def l2_normalize(self, data, axis, epsilon):
""" Compute the groundtruth.
"""
output = data * np.reciprocal(
np.sum(np.square(data), axis=axis, keepdims=True))
output = data / np.broadcast_to(
np.sqrt(np.sum(np.square(data), axis=axis, keepdims=True)),
data.shape)
return output
def test_l2_normalize(self):

Loading…
Cancel
Save