From ca0177190f75a4f39482b8fe1c8e929ab8e1a381 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 22 Jan 2018 15:18:47 +0800 Subject: [PATCH 01/10] add layer_norm --- paddle/operators/layer_norm_op.cc | 283 ++++++++++++++++++ paddle/operators/layer_norm_op.h | 35 +++ .../v2/fluid/tests/test_layer_norm_op.py | 81 +++++ 3 files changed, 399 insertions(+) create mode 100644 paddle/operators/layer_norm_op.cc create mode 100644 paddle/operators/layer_norm_op.h create mode 100644 python/paddle/v2/fluid/tests/test_layer_norm_op.py diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc new file mode 100644 index 0000000000..f1ddcd8210 --- /dev/null +++ b/paddle/operators/layer_norm_op.cc @@ -0,0 +1,283 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/layer_norm_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenMatrixMapRowMajor = Eigen::Map< + Eigen::Matrix>; +template +using ConstEigenMatrixMapRowMajor = Eigen::Map< + const Eigen::Matrix>; + +class LayerNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); + PADDLE_ENFORCE(ctx->HasInput("Bias"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); + + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], 1); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], 1); + + ctx->SetOutputDim("Y", ctx->GetInputDim("X")); + ctx->SetOutputDim("Mean", {ctx->GetInputDim("X")[0]}); + ctx->SetOutputDim("Variance", {ctx->GetInputDim("X")[0]}); + + ctx->ShareLoD("X", "Y"); + } +}; + +class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size 1 " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size 1 " + "that is applied to the output"); + AddOutput("Y", "result after normalization"); + AddOutput("Mean", "Mean of the current mini batch."); + AddOutput("Variance", "Variance of the current mini batch."); + + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr>("axis", + "(vector default:{1, 1, 1}), the " + "axis to normalize.") + .SetDefault({1, 2, 3}); // todo(zcd) : who to set axis + + AddComment(R"DOC( +Layer Normalization. 
+ +Layer Norm has been implemented as discussed in the paper: +https://arxiv.org/abs/1607.06450 +... +)DOC"); + } +}; + +template +class LayerNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + + const int N = x_dims[0]; + const int sample_size = x->numel() / N; + + auto scale_data = scale->data()[0]; + auto bias_data = bias->data()[0]; + + auto *output = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + output->mutable_data(ctx.GetPlace()); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + + int left = N, right = sample_size; + auto input_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); + auto mean_map = EigenMatrixMapRowMajor(mean->data(), left, 1); + auto var_map = EigenMatrixMapRowMajor(var->data(), left, 1); + auto output_map = EigenMatrixMapRowMajor(output->data(), left, right); + + auto squre = [](T ele) { return ele * ele; }; + auto add_epslion = [epsilon](T ele) { return ele + epsilon; }; + + mean_map = input_map.rowwise().mean(); + var_map = (input_map - mean_map.replicate(1, right)) + .unaryExpr(squre) + .rowwise() + .mean() + .unaryExpr(add_epslion); + + auto scale_inv_std = [scale_data](T ele) { + return std::sqrt(1 / ele) * scale_data; + }; + auto sub_bias = [bias_data](T ele) { return bias_data - ele; }; + + output_map = (var_map.unaryExpr(scale_inv_std).replicate(1, right)) + .cwiseProduct(input_map) + + var_map.unaryExpr(scale_inv_std) + .cwiseProduct(mean_map) + .unaryExpr(sub_bias) + .replicate(1, right); + } +}; + +class LayerNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); + PADDLE_ENFORCE(ctx->HasInput("Mean"), ""); + PADDLE_ENFORCE(ctx->HasInput("Variance"), ""); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), ""); + + const auto x_dims = ctx->GetInputDim("X"); + + // check output + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), {1}); + } + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), {1}); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + return framework::OpKernelType(framework::ToDataType(t->type()), + ctx.GetPlace()); + } +}; + +template +class LayerNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *mean = ctx.Input("Mean"); + const auto *var = ctx.Input("Variance"); + const auto *scale = ctx.Input("Scale"); + const 
auto *d_y = ctx.Input(framework::GradVarName("Y")); + + const auto &x_dims = x->dims(); + const int N = x_dims[0]; + const int sample_size = x->numel() / N; + int left = N, right = sample_size; + + auto scale_data = scale->data()[0]; + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + auto x_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); + auto d_y_map = ConstEigenMatrixMapRowMajor(d_y->data(), left, right); + auto mean_map = ConstEigenMatrixMapRowMajor(mean->data(), left, 1); + auto var_map = ConstEigenMatrixMapRowMajor(var->data(), left, 1); + + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + d_bias->data()[0] = d_y_map.sum(); + } + if (d_scale) { + d_scale->mutable_data(ctx.GetPlace()); + auto inv_std = [](T ele) { return std::sqrt(1 / ele); }; + d_scale->data()[0] = + ((x_map - mean_map.replicate(1, right)) + .cwiseProduct(var_map.unaryExpr(inv_std).replicate(1, right)) + .cwiseProduct(d_y_map)) + .sum(); // also can use `y` to get d_scale_map + } + + if (d_x) { + d_x->mutable_data(ctx.GetPlace()); + auto d_x_map = EigenMatrixMapRowMajor(d_x->data(), left, right); + auto triple_product = [](T ele) { return ele * ele * ele; }; + auto neg_inv_std = [](T ele) { return T(-1.0) * std::sqrt(1 / ele); }; + auto inv_std_scale_func = [scale_data](T ele) { + return std::sqrt(1 / ele) * scale_data; + }; + auto neg_inv_std_scale_func = [scale_data](T ele) { + return T(-1.0) * std::sqrt(1 / ele) * scale_data; + }; + // dy_dx + auto dx_end = var_map.unaryExpr(inv_std_scale_func) + .replicate(1, right) + .cwiseProduct(d_y_map); + // dy_dmean_dx + auto dmean_end = var_map.unaryExpr(neg_inv_std_scale_func) + .replicate(1, right) + .cwiseProduct(d_y_map) + .rowwise() + .sum(); + auto dx_mean = (T(1.0) / right) * dmean_end.replicate(1, right); + // dy_var_dx + auto dvar_end_0 = (x_map - mean_map.replicate(1, right)) + .cwiseProduct(d_y_map) + .rowwise() + .sum(); + auto dvar_end = var_map.unaryExpr(neg_inv_std) + .unaryExpr(triple_product) + .cwiseProduct(dvar_end_0); + auto dx_var = (1.0f / right) * + (x_map - mean_map.replicate(1, right)) + .cwiseProduct(dvar_end.replicate(1, right)); + + d_x_map = dx_end + dx_mean + dx_var; + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, + layer_norm_grad, ops::LayerNormGradOp); +REGISTER_OP_CPU_KERNEL( + layer_norm, + ops::LayerNormKernel); +REGISTER_OP_CPU_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel); diff --git a/paddle/operators/layer_norm_op.h b/paddle/operators/layer_norm_op.h new file mode 100644 index 0000000000..bca35b91e6 --- /dev/null +++ b/paddle/operators/layer_norm_op.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class LayerNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +template +class LayerNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py new file mode 100644 index 0000000000..73450c599d --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -0,0 +1,81 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +from op_test import OpTest + + +def layer_norm_naive(x, scale, beta, epsilon): + n, c, h, w = x.shape + mean = np.mean(x, axis=(1, 2, 3)) + var = np.var(x, axis=(1, 2, 3)) + epsilon + output = scale * np.divide((x - mean.reshape([n, 1, 1, 1])), + (np.sqrt(var)).reshape([n, 1, 1, 1])) + beta + return output, mean, var + + +class TestLayerNormdOp(OpTest): + def setUp(self): + self.init_test_case() + + input = np.random.random(self.input_size).astype("float32") + self.inputs = { + 'X': input, + 'Scale': np.array([self.scale]).astype("float32"), + 'Bias': np.array([self.bias]).astype("float32") + } + output, mean, var = layer_norm_naive(input, self.scale, self.bias, + self.epsilon) + self.outputs = {'Y': output, 'Mean': mean, 'Variance': var} + + def test_check_output(self): + self.check_output() + + # def test_check_grad(self): + # self.check_grad( + # ['Scale', 'Bias', 'X'], ['Y', 'Mean', 'Variance'], + # max_relative_error=0.02) + + def test_check_grad_no_x(self): + self.check_grad( + ['Scale', 'Bias'], ['Y', 'Mean', 'Variance'], + max_relative_error=0.02, + no_grad_set=set(['X'])) + + # def test_check_grad_no_scale(self): + # self.check_grad( + # ['Bias','X'], + # 'Y', + # max_relative_error=0.02, + # no_grad_set=set(['Scale'])) + # + # def test_check_grad_no_bias(self): + # self.check_grad( + # ['Scale','X'], + # 'Y', + # max_relative_error=0.02, + # no_grad_set=set(['Bias'])) + + def init_test_case(self): + self.op_type = "layer_norm" + self.input_size = [2, 3, 4, 5] + self.scale = 0.21 + self.bias = 0.1 + self.epsilon = 0.00001 + + +if __name__ == '__main__': + unittest.main() From ae0ea5415902f3187c7883016c3798ee5ec64fab Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 24 Jan 2018 14:14:42 +0800 Subject: [PATCH 02/10] fix unit test --- paddle/operators/layer_norm_op.cc | 11 +- .../v2/fluid/tests/test_layer_norm_op.py | 261 ++++++++++++++---- 2 files changed, 215 insertions(+), 57 deletions(-) diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index f1ddcd8210..0808192565 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ 
-233,13 +233,13 @@ class LayerNormGradKernel if (d_x) { d_x->mutable_data(ctx.GetPlace()); auto d_x_map = EigenMatrixMapRowMajor(d_x->data(), left, right); - auto triple_product = [](T ele) { return ele * ele * ele; }; - auto neg_inv_std = [](T ele) { return T(-1.0) * std::sqrt(1 / ele); }; + auto triple_product = [](T ele) { return ele * ele; }; + auto neg_inv_std = [](T ele) { return -std::sqrt(1 / ele); }; auto inv_std_scale_func = [scale_data](T ele) { return std::sqrt(1 / ele) * scale_data; }; auto neg_inv_std_scale_func = [scale_data](T ele) { - return T(-1.0) * std::sqrt(1 / ele) * scale_data; + return -std::sqrt(1 / ele) * scale_data; }; // dy_dx auto dx_end = var_map.unaryExpr(inv_std_scale_func) @@ -260,10 +260,13 @@ class LayerNormGradKernel auto dvar_end = var_map.unaryExpr(neg_inv_std) .unaryExpr(triple_product) .cwiseProduct(dvar_end_0); - auto dx_var = (1.0f / right) * + auto dx_var = (T(1.0) / right) * (x_map - mean_map.replicate(1, right)) .cwiseProduct(dvar_end.replicate(1, right)); + // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) + // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) + d_x_map = dx_end + dx_mean + dx_var; } } diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py index 73450c599d..4ca9754f32 100644 --- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -15,66 +15,221 @@ import unittest import numpy as np +from operator import mul from op_test import OpTest +import paddle.v2.fluid.core as core +from paddle.v2.fluid.op import Operator +from paddle.v2.fluid.framework import grad_var_name -def layer_norm_naive(x, scale, beta, epsilon): - n, c, h, w = x.shape - mean = np.mean(x, axis=(1, 2, 3)) - var = np.var(x, axis=(1, 2, 3)) + epsilon - output = scale * np.divide((x - mean.reshape([n, 1, 1, 1])), - (np.sqrt(var)).reshape([n, 1, 1, 1])) + beta +def get_backward_op(scope, op, no_grad_set): + backward_op = core.Operator.backward(op, no_grad_set) + for input in backward_op.input_vars(): + var = scope.var(input) + var.get_tensor() + for output in backward_op.output_vars(): + var = scope.var(output) + var.get_tensor() + return backward_op + + +def _reference_layer_norm_naive(x, scale, beta, epsilon): + old_shape = x.shape + N = x.shape[0] + D = reduce(mul, old_shape, 1) / N + x.shape = [N, D] + mean = np.mean(x, axis=1) + var = np.var(x, axis=1) + epsilon + output = scale * np.divide((x - mean.reshape([N, 1])), + (np.sqrt(var)).reshape([N, 1])) + beta + output.shape = old_shape return output, mean, var +def _reference_layer_norm_grad(x, grad_y, scale, mean, var, epsilon): + x_shape = x.shape + N = x_shape[0] + D = reduce(mul, x_shape, 1) / N + grad_y.shape = [N, D] + x.shape = [N, D] + grad_offset = np.sum(grad_y) + mean.shape = [N, 1] + var.shape = [N, 1] + grad_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y) + + dx_end = np.sqrt(1.0 / var) * grad_y + + d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y, axis=1).reshape([N, 1]) + d_mean_1 = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape( + [N, 1]) * (-1.0 / D * np.sqrt(1.0 / var) * + np.sum(x - mean, axis=1).reshape([N, 1])).reshape([N, 1]) + d_mean = 1.0 / D * (d_mean_0 + d_mean_1) + + d_std = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape([N, 1]) * ( + 1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) + + grad_x = scale * (dx_end + d_mean + d_std) + + grad_y.shape = x_shape + x.shape = x_shape + + return grad_x, 
grad_scale, grad_offset + + +def create_or_get_tensor(scope, var_name, var, place): + tensor = scope.var(var_name).get_tensor() + if var is not None: + assert isinstance(var, np.ndarray) + tensor.set_lod([[]]) + tensor.set_dims(var.shape) + tensor.set(var, place) + return tensor + + +def set_output_grad(scope, outputs, place, feed_dict=None): + def __set_tensor__(name, data=None): + out_tensor = scope.find_var(name).get_tensor() + grad_tensor = scope.var(grad_var_name(name)).get_tensor() + out_dtype = out_tensor.dtype() + if data is None: + if out_dtype == core.DataType.FP64: + data = np.ones(out_tensor.shape(), dtype=np.float64) + elif out_dtype == core.DataType.FP32: + data = np.ones(out_tensor.shape(), dtype=np.float32) + else: + raise ValueError("Not supported data type " + str(out_dtype)) + grad_tensor.set(data, place) + + for output in outputs: + data = None + if output in feed_dict: + data = feed_dict[output] + __set_tensor__(output, data) + + class TestLayerNormdOp(OpTest): - def setUp(self): - self.init_test_case() - - input = np.random.random(self.input_size).astype("float32") - self.inputs = { - 'X': input, - 'Scale': np.array([self.scale]).astype("float32"), - 'Bias': np.array([self.bias]).astype("float32") - } - output, mean, var = layer_norm_naive(input, self.scale, self.bias, - self.epsilon) - self.outputs = {'Y': output, 'Mean': mean, 'Variance': var} - - def test_check_output(self): - self.check_output() - - # def test_check_grad(self): - # self.check_grad( - # ['Scale', 'Bias', 'X'], ['Y', 'Mean', 'Variance'], - # max_relative_error=0.02) - - def test_check_grad_no_x(self): - self.check_grad( - ['Scale', 'Bias'], ['Y', 'Mean', 'Variance'], - max_relative_error=0.02, - no_grad_set=set(['X'])) - - # def test_check_grad_no_scale(self): - # self.check_grad( - # ['Bias','X'], - # 'Y', - # max_relative_error=0.02, - # no_grad_set=set(['Scale'])) - # - # def test_check_grad_no_bias(self): - # self.check_grad( - # ['Scale','X'], - # 'Y', - # max_relative_error=0.02, - # no_grad_set=set(['Bias'])) - - def init_test_case(self): - self.op_type = "layer_norm" - self.input_size = [2, 3, 4, 5] - self.scale = 0.21 - self.bias = 0.1 - self.epsilon = 0.00001 + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue( + np.allclose( + np.array(tensor).reshape(np_array.shape), np_array, atol=atol), + msg) + + def __assert_grad_close(self, + tensor, + np_array, + name, + place, + max_relative_error=0.02): + a = np.array(tensor).reshape(np_array.shape) + b = np_array + abs_a = np.abs(a) + abs_a[abs_a < 1e-5] = 1 + + diff_mat = np.abs(a - b) / abs_a + max_diff = np.max(diff_mat) + + def err_msg(): + offset = np.argmax(diff_mat > max_relative_error) + return ("%s Variable %s max gradient diff %f over limit %f, " + "the first error element is %d, %f, %f") % ( + "Gradient Check On %s" % str(place), name, max_diff, + max_relative_error, offset, a.flatten()[offset], + b.flatten()[offset]) + + self.assertLessEqual(max_diff, max_relative_error, err_msg()) + + def test_forward_backward(self): + def test_with_place(place, shape): + # attr + epsilon = 0.00001 + x_shape = shape + scale_shape = [1] + + x_val = np.random.random_sample(x_shape).astype(np.float32) + scale_val = np.random.random_sample(scale_shape).astype(np.float32) + bias_val = np.random.random_sample(scale_shape).astype(np.float32) + + # run forward + y_out, saved_mean, var_ref = _reference_layer_norm_naive( + x_val, scale_val, bias_val, epsilon) + + # for gradient test + # y_grad = np.ones(x_shape).astype(np.float32) 
* 0.00277778 + y_grad = np.random.random_sample(x_shape).astype(np.float32) + + x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad( + x_val, y_grad, scale_val, saved_mean, var_ref, epsilon) + + scope = core.Scope() + + # create input + x_tensor = create_or_get_tensor(scope, "X", x_val, place) + scale_tensor = create_or_get_tensor(scope, "Scale", scale_val, + place) + bias_tensor = create_or_get_tensor(scope, "Bias", bias_val, place) + + # create output + y_tensor = create_or_get_tensor(scope, "Y", None, place) + mean_tensor = create_or_get_tensor(scope, "Mean", None, place) + variance_tensor = create_or_get_tensor(scope, "Variance", None, + place) + + layer_norm_op = Operator( + "layer_norm", + # inputs + X="X", + Scale="Scale", + Bias="Bias", + # outputs + Y="Y", + Mean="Mean", + Variance="Variance", + # attrs + epsilon=epsilon) + + layer_norm_op.run(scope, place) + + # check forward result + if isinstance(place, core.CUDAPlace): + atol = 5e-2 + else: + atol = 1e-4 + self.__assert_close(y_tensor, y_out, "Y", atol) + self.__assert_close(mean_tensor, saved_mean, "Mean", atol) + self.__assert_close(variance_tensor, var_ref, "Variance", atol) + + # run backward + layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set()) + set_output_grad( + scope, ["Y", "Mean", "Variance"], + place, + feed_dict={"Y": y_grad}) + layer_norm_op_grad.run(scope, place) + + x_grad_tensor = create_or_get_tensor(scope, + grad_var_name("X"), None, + place) + scale_grad_tensor = create_or_get_tensor(scope, + grad_var_name("Scale"), + None, place) + bias_grad_tensor = create_or_get_tensor(scope, + grad_var_name("Bias"), None, + place) + + # check gradient output + self.__assert_grad_close(x_grad_tensor, x_grad_ref, "x_grad", place) + self.__assert_grad_close(scale_grad_tensor, scale_grad_ref, + "scale_grad", place) + self.__assert_grad_close(bias_grad_tensor, bias_grad_ref, + "bias_grad", place) + + places = [core.CPUPlace()] + if core.is_compile_gpu() and core.op_support_gpu("layer_norm"): + places.append(core.CUDAPlace(0)) + + for place in places: + test_with_place(place, [2, 3, 4, 5]) + test_with_place(place, [2, 3]) if __name__ == '__main__': From 4ce397964b788f1e9bbbe08a8e5f4d4ce21dd2f8 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Sat, 27 Jan 2018 12:31:30 +0800 Subject: [PATCH 03/10] fix unit test and c++ code --- paddle/operators/layer_norm_op.cc | 44 +++++++++---------- .../v2/fluid/tests/test_layer_norm_op.py | 19 ++++---- 2 files changed, 30 insertions(+), 33 deletions(-) diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index 0808192565..0b0c760e57 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -233,39 +233,37 @@ class LayerNormGradKernel if (d_x) { d_x->mutable_data(ctx.GetPlace()); auto d_x_map = EigenMatrixMapRowMajor(d_x->data(), left, right); - auto triple_product = [](T ele) { return ele * ele; }; - auto neg_inv_std = [](T ele) { return -std::sqrt(1 / ele); }; + auto triple_product_func = [](T ele) { return ele * ele * ele; }; + auto scale_func = [scale_data](T ele) { return ele * scale_data; }; + auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; auto inv_std_scale_func = [scale_data](T ele) { return std::sqrt(1 / ele) * scale_data; }; - auto neg_inv_std_scale_func = [scale_data](T ele) { - return -std::sqrt(1 / ele) * scale_data; - }; // dy_dx auto dx_end = var_map.unaryExpr(inv_std_scale_func) .replicate(1, right) .cwiseProduct(d_y_map); // dy_dmean_dx - auto dmean_end = 
var_map.unaryExpr(neg_inv_std_scale_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .rowwise() - .sum(); - auto dx_mean = (T(1.0) / right) * dmean_end.replicate(1, right); + auto dx_mean = (T(-1.0) / right) * + var_map.unaryExpr(inv_std_scale_func) + .replicate(1, right) + .cwiseProduct(d_y_map) + .rowwise() + .sum() + .replicate(1, right); // dy_var_dx - auto dvar_end_0 = (x_map - mean_map.replicate(1, right)) - .cwiseProduct(d_y_map) - .rowwise() - .sum(); - auto dvar_end = var_map.unaryExpr(neg_inv_std) - .unaryExpr(triple_product) - .cwiseProduct(dvar_end_0); - auto dx_var = (T(1.0) / right) * + auto dvar_end_part = (x_map - mean_map.replicate(1, right)) + .cwiseProduct(d_y_map) + .rowwise() + .sum(); + auto dvar_end = var_map.unaryExpr(inv_std_func) + .unaryExpr(triple_product_func) + .cwiseProduct(dvar_end_part) + .replicate(1, right); + auto dx_var = (T(-1.0) / right) * (x_map - mean_map.replicate(1, right)) - .cwiseProduct(dvar_end.replicate(1, right)); - - // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) - // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) + .cwiseProduct(dvar_end) + .unaryExpr(scale_func); d_x_map = dx_end + dx_mean + dx_var; } diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py index 4ca9754f32..caa3b944eb 100644 --- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -52,18 +52,19 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, epsilon): D = reduce(mul, x_shape, 1) / N grad_y.shape = [N, D] x.shape = [N, D] - grad_offset = np.sum(grad_y) mean.shape = [N, 1] var.shape = [N, 1] - grad_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y) + + d_scale = np.sum(grad_y).reshape([1, ]) + d_bias = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y).reshape([1, ]) dx_end = np.sqrt(1.0 / var) * grad_y d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y, axis=1).reshape([N, 1]) - d_mean_1 = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape( - [N, 1]) * (-1.0 / D * np.sqrt(1.0 / var) * - np.sum(x - mean, axis=1).reshape([N, 1])).reshape([N, 1]) - d_mean = 1.0 / D * (d_mean_0 + d_mean_1) + # d_mean_1 = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape( + # [N, 1]) * (-1.0 / D * np.sqrt(1.0 / var) * + # np.sum(x - mean, axis=1).reshape([N, 1])).reshape([N, 1]) + d_mean = 1.0 / D * (d_mean_0) d_std = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape([N, 1]) * ( 1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) @@ -73,7 +74,7 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, epsilon): grad_y.shape = x_shape x.shape = x_shape - return grad_x, grad_scale, grad_offset + return grad_x, d_bias, d_scale def create_or_get_tensor(scope, var_name, var, place): @@ -144,7 +145,7 @@ class TestLayerNormdOp(OpTest): epsilon = 0.00001 x_shape = shape scale_shape = [1] - + np.random.random(123) x_val = np.random.random_sample(x_shape).astype(np.float32) scale_val = np.random.random_sample(scale_shape).astype(np.float32) bias_val = np.random.random_sample(scale_shape).astype(np.float32) @@ -154,7 +155,6 @@ class TestLayerNormdOp(OpTest): x_val, scale_val, bias_val, epsilon) # for gradient test - # y_grad = np.ones(x_shape).astype(np.float32) * 0.00277778 y_grad = np.random.random_sample(x_shape).astype(np.float32) x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad( @@ -229,7 +229,6 @@ class TestLayerNormdOp(OpTest): for place in places: 
test_with_place(place, [2, 3, 4, 5]) - test_with_place(place, [2, 3]) if __name__ == '__main__': From 0f47703dd5db01a7510031e810f963e09a8c9c13 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Sun, 28 Jan 2018 16:41:18 +0800 Subject: [PATCH 04/10] add begin_norm_axis --- paddle/operators/layer_norm_op.cc | 47 ++++++++++++------- .../v2/fluid/tests/test_layer_norm_op.py | 28 ++++++----- 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index 0b0c760e57..9e618d10d2 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -42,10 +42,17 @@ class LayerNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], 1); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], 1); + auto x_dim = ctx->GetInputDim("X"); + auto begin_norm_axis = ctx->Attrs().Get("begin_norm_axis"); + PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(), + "'begin_norm_axis' must be less than the rank of X"); + + auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis); + int left = static_cast(matrix_dim[0]); ctx->SetOutputDim("Y", ctx->GetInputDim("X")); - ctx->SetOutputDim("Mean", {ctx->GetInputDim("X")[0]}); - ctx->SetOutputDim("Variance", {ctx->GetInputDim("X")[0]}); + ctx->SetOutputDim("Mean", {left}); + ctx->SetOutputDim("Variance", {left}); ctx->ShareLoD("X", "Y"); } @@ -72,10 +79,14 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, "'epsilon' should be between 0.0 and 0.001."); }); - AddAttr>("axis", - "(vector default:{1, 1, 1}), the " - "axis to normalize.") - .SetDefault({1, 2, 3}); // todo(zcd) : who to set axis + AddAttr("begin_norm_axis", + "(int default:1), the " + "axis of `begin_norm_axis ... Rank(X) - 1` will be normalized") + .SetDefault(1) + .AddCustomChecker([](const int &begin_norm_axis) { + PADDLE_ENFORCE_GT(begin_norm_axis, 0, + "'begin_norm_axis' should be greater than zero."); + }); AddComment(R"DOC( Layer Normalization. @@ -97,9 +108,7 @@ class LayerNormKernel const auto *bias = ctx.Input("Bias"); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - - const int N = x_dims[0]; - const int sample_size = x->numel() / N; + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); auto scale_data = scale->data()[0]; auto bias_data = bias->data()[0]; @@ -111,7 +120,9 @@ class LayerNormKernel mean->mutable_data(ctx.GetPlace()); var->mutable_data(ctx.GetPlace()); - int left = N, right = sample_size; + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); auto input_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); auto mean_map = EigenMatrixMapRowMajor(mean->data(), left, 1); auto var_map = EigenMatrixMapRowMajor(var->data(), left, 1); @@ -131,7 +142,8 @@ class LayerNormKernel return std::sqrt(1 / ele) * scale_data; }; auto sub_bias = [bias_data](T ele) { return bias_data - ele; }; - + // TODO(zcd): Some thinking about output_map, is it appropriate that + // `output_map` and `input_map` point to the same memory. 
output_map = (var_map.unaryExpr(scale_inv_std).replicate(1, right)) .cwiseProduct(input_map) + var_map.unaryExpr(scale_inv_std) @@ -198,13 +210,14 @@ class LayerNormGradKernel const auto *var = ctx.Input("Variance"); const auto *scale = ctx.Input("Scale"); const auto *d_y = ctx.Input(framework::GradVarName("Y")); + auto scale_data = scale->data()[0]; const auto &x_dims = x->dims(); - const int N = x_dims[0]; - const int sample_size = x->numel() / N; - int left = N, right = sample_size; - auto scale_data = scale->data()[0]; + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]), + right = static_cast(matrix_dim[1]); // init output auto *d_x = ctx.Output(framework::GradVarName("X")); @@ -223,11 +236,13 @@ class LayerNormGradKernel if (d_scale) { d_scale->mutable_data(ctx.GetPlace()); auto inv_std = [](T ele) { return std::sqrt(1 / ele); }; + // There are two equation to compute d_scale. One uses "Y" and the other + // does not use "Y" d_scale->data()[0] = ((x_map - mean_map.replicate(1, right)) .cwiseProduct(var_map.unaryExpr(inv_std).replicate(1, right)) .cwiseProduct(d_y_map)) - .sum(); // also can use `y` to get d_scale_map + .sum(); } if (d_x) { diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py index caa3b944eb..8ce327436f 100644 --- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import unittest import numpy as np @@ -33,23 +32,24 @@ def get_backward_op(scope, op, no_grad_set): return backward_op -def _reference_layer_norm_naive(x, scale, beta, epsilon): +def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): old_shape = x.shape - N = x.shape[0] - D = reduce(mul, old_shape, 1) / N + N = reduce(mul, old_shape[0:begin_norm_axis], 1) + D = reduce(mul, old_shape[begin_norm_axis:len(old_shape)], 1) x.shape = [N, D] mean = np.mean(x, axis=1) var = np.var(x, axis=1) + epsilon output = scale * np.divide((x - mean.reshape([N, 1])), (np.sqrt(var)).reshape([N, 1])) + beta output.shape = old_shape + x.shape = old_shape return output, mean, var -def _reference_layer_norm_grad(x, grad_y, scale, mean, var, epsilon): +def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1): x_shape = x.shape - N = x_shape[0] - D = reduce(mul, x_shape, 1) / N + N = reduce(mul, x_shape[0:begin_norm_axis], 1) + D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) grad_y.shape = [N, D] x.shape = [N, D] mean.shape = [N, 1] @@ -140,7 +140,9 @@ class TestLayerNormdOp(OpTest): self.assertLessEqual(max_diff, max_relative_error, err_msg()) def test_forward_backward(self): - def test_with_place(place, shape): + def test_with_place(place, shape, begin_norm_axis=1): + assert begin_norm_axis > 0 and begin_norm_axis < len( + shape), 'begin_norm_axis must be between 0 and len(shape)-1.' 
# attr epsilon = 0.00001 x_shape = shape @@ -152,13 +154,13 @@ class TestLayerNormdOp(OpTest): # run forward y_out, saved_mean, var_ref = _reference_layer_norm_naive( - x_val, scale_val, bias_val, epsilon) + x_val, scale_val, bias_val, epsilon, begin_norm_axis) # for gradient test y_grad = np.random.random_sample(x_shape).astype(np.float32) x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad( - x_val, y_grad, scale_val, saved_mean, var_ref, epsilon) + x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis) scope = core.Scope() @@ -185,7 +187,8 @@ class TestLayerNormdOp(OpTest): Mean="Mean", Variance="Variance", # attrs - epsilon=epsilon) + epsilon=epsilon, + begin_norm_axis=begin_norm_axis) layer_norm_op.run(scope, place) @@ -228,7 +231,8 @@ class TestLayerNormdOp(OpTest): places.append(core.CUDAPlace(0)) for place in places: - test_with_place(place, [2, 3, 4, 5]) + test_with_place(place, [2, 3, 4, 5], begin_norm_axis=1) + test_with_place(place, [2, 3, 4, 5], begin_norm_axis=3) if __name__ == '__main__': From 87b5559cd15a28d515b16f3ad04ca9919c7edd32 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 29 Jan 2018 20:41:08 +0800 Subject: [PATCH 05/10] fix scale and bias dim --- paddle/operators/layer_norm_op.cc | 84 +++++++++---------- .../v2/fluid/tests/test_layer_norm_op.py | 16 ++-- 2 files changed, 52 insertions(+), 48 deletions(-) diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index 9e618d10d2..07ca8ac222 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -38,10 +38,6 @@ class LayerNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Bias"), ""); PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], 1); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], 1); auto x_dim = ctx->GetInputDim("X"); auto begin_norm_axis = ctx->Attrs().Get("begin_norm_axis"); PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(), @@ -50,6 +46,11 @@ class LayerNormOp : public framework::OperatorWithKernel { auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis); int left = static_cast(matrix_dim[0]); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], left); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], left); + ctx->SetOutputDim("Y", ctx->GetInputDim("X")); ctx->SetOutputDim("Mean", {left}); ctx->SetOutputDim("Variance", {left}); @@ -64,10 +65,10 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor"); AddInput("Scale", - "Scale is a 1-dimensional tensor of size 1 " + "Scale is a 1-dimensional tensor of size H " "that is applied to the output"); AddInput("Bias", - "Bias is a 1-dimensional tensor of size 1 " + "Bias is a 1-dimensional tensor of size H " "that is applied to the output"); AddOutput("Y", "result after normalization"); AddOutput("Mean", "Mean of the current mini batch."); @@ -110,9 +111,6 @@ class LayerNormKernel const auto &x_dims = x->dims(); const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - auto scale_data = scale->data()[0]; - auto bias_data = bias->data()[0]; - auto *output = ctx.Output("Y"); auto *mean = ctx.Output("Mean"); auto *var = ctx.Output("Variance"); @@ -123,7 +121,10 @@ class 
LayerNormKernel auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); + auto input_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); + auto scale_map = ConstEigenMatrixMapRowMajor(scale->data(), left, 1); + auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), left, 1); auto mean_map = EigenMatrixMapRowMajor(mean->data(), left, 1); auto var_map = EigenMatrixMapRowMajor(var->data(), left, 1); auto output_map = EigenMatrixMapRowMajor(output->data(), left, right); @@ -138,18 +139,15 @@ class LayerNormKernel .mean() .unaryExpr(add_epslion); - auto scale_inv_std = [scale_data](T ele) { - return std::sqrt(1 / ele) * scale_data; - }; - auto sub_bias = [bias_data](T ele) { return bias_data - ele; }; + auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; + // TODO(zcd): Some thinking about output_map, is it appropriate that // `output_map` and `input_map` point to the same memory. - output_map = (var_map.unaryExpr(scale_inv_std).replicate(1, right)) - .cwiseProduct(input_map) + - var_map.unaryExpr(scale_inv_std) - .cwiseProduct(mean_map) - .unaryExpr(sub_bias) - .replicate(1, right); + auto inv_std_scale = + var_map.unaryExpr(inv_std_func).cwiseProduct(scale_map); + output_map = + inv_std_scale.replicate(1, right).cwiseProduct(input_map) + + (bias_map - inv_std_scale.cwiseProduct(mean_map)).replicate(1, right); } }; @@ -165,17 +163,17 @@ class LayerNormGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Variance"), ""); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), ""); - const auto x_dims = ctx->GetInputDim("X"); - // check output if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } if (ctx->HasOutput(framework::GradVarName("Scale"))) { - ctx->SetOutputDim(framework::GradVarName("Scale"), {1}); + ctx->SetOutputDim(framework::GradVarName("Scale"), + ctx->GetInputDim("Scale")); } if (ctx->HasOutput(framework::GradVarName("Bias"))) { - ctx->SetOutputDim(framework::GradVarName("Bias"), {1}); + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); } } @@ -210,20 +208,20 @@ class LayerNormGradKernel const auto *var = ctx.Input("Variance"); const auto *scale = ctx.Input("Scale"); const auto *d_y = ctx.Input(framework::GradVarName("Y")); - auto scale_data = scale->data()[0]; const auto &x_dims = x->dims(); const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]), - right = static_cast(matrix_dim[1]); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); // init output auto *d_x = ctx.Output(framework::GradVarName("X")); auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto scale_map = ConstEigenMatrixMapRowMajor(scale->data(), left, 1); auto x_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); auto d_y_map = ConstEigenMatrixMapRowMajor(d_y->data(), left, right); auto mean_map = ConstEigenMatrixMapRowMajor(mean->data(), left, 1); @@ -231,36 +229,38 @@ class LayerNormGradKernel if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); - d_bias->data()[0] = d_y_map.sum(); + auto d_bias_map = EigenMatrixMapRowMajor(d_bias->data(), left, 1); + d_bias_map = d_y_map.colwise().mean(); } if 
(d_scale) { d_scale->mutable_data(ctx.GetPlace()); - auto inv_std = [](T ele) { return std::sqrt(1 / ele); }; + auto d_scale_map = EigenMatrixMapRowMajor(d_scale->data(), left, 1); + auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; // There are two equation to compute d_scale. One uses "Y" and the other // does not use "Y" - d_scale->data()[0] = + d_scale_map = ((x_map - mean_map.replicate(1, right)) - .cwiseProduct(var_map.unaryExpr(inv_std).replicate(1, right)) + .cwiseProduct( + var_map.unaryExpr(inv_std_func).replicate(1, right)) .cwiseProduct(d_y_map)) - .sum(); + .colwise() + .mean(); } if (d_x) { d_x->mutable_data(ctx.GetPlace()); auto d_x_map = EigenMatrixMapRowMajor(d_x->data(), left, right); auto triple_product_func = [](T ele) { return ele * ele * ele; }; - auto scale_func = [scale_data](T ele) { return ele * scale_data; }; auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - auto inv_std_scale_func = [scale_data](T ele) { - return std::sqrt(1 / ele) * scale_data; - }; // dy_dx - auto dx_end = var_map.unaryExpr(inv_std_scale_func) + auto dx_end = var_map.unaryExpr(inv_std_func) + .cwiseProduct(scale_map) .replicate(1, right) .cwiseProduct(d_y_map); // dy_dmean_dx auto dx_mean = (T(-1.0) / right) * - var_map.unaryExpr(inv_std_scale_func) + var_map.unaryExpr(inv_std_func) + .cwiseProduct(scale_map) .replicate(1, right) .cwiseProduct(d_y_map) .rowwise() @@ -274,11 +274,11 @@ class LayerNormGradKernel auto dvar_end = var_map.unaryExpr(inv_std_func) .unaryExpr(triple_product_func) .cwiseProduct(dvar_end_part) + .cwiseProduct(scale_map) .replicate(1, right); - auto dx_var = (T(-1.0) / right) * - (x_map - mean_map.replicate(1, right)) - .cwiseProduct(dvar_end) - .unaryExpr(scale_func); + auto dx_var = + (T(-1.0) / right) * + (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); d_x_map = dx_end + dx_mean + dx_var; } diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py index 8ce327436f..9264cf4b79 100644 --- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -39,8 +39,9 @@ def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): x.shape = [N, D] mean = np.mean(x, axis=1) var = np.var(x, axis=1) + epsilon - output = scale * np.divide((x - mean.reshape([N, 1])), - (np.sqrt(var)).reshape([N, 1])) + beta + output = scale.reshape([1, D]) * np.divide( + (x - mean.reshape([N, 1])), + (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D]) output.shape = old_shape x.shape = old_shape return output, mean, var @@ -55,8 +56,10 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1): mean.shape = [N, 1] var.shape = [N, 1] - d_scale = np.sum(grad_y).reshape([1, ]) - d_bias = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y).reshape([1, ]) + d_scale = np.sum(grad_y, axis=1).reshape([1, D]) + d_bias = scale.reshape([1, D]) * np.sum(( + (x - mean) * np.sqrt(1 / var)) * grad_y, + axis=1).reshape([1, D]) dx_end = np.sqrt(1.0 / var) * grad_y @@ -69,7 +72,7 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1): d_std = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape([N, 1]) * ( 1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) - grad_x = scale * (dx_end + d_mean + d_std) + grad_x = scale.reshape([1, D]) * (dx_end + d_mean + d_std) grad_y.shape = x_shape x.shape = x_shape @@ -146,7 +149,8 @@ class TestLayerNormdOp(OpTest): # attr epsilon = 0.00001 x_shape = shape 
- scale_shape = [1] + D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) + scale_shape = [D] np.random.random(123) x_val = np.random.random_sample(x_shape).astype(np.float32) scale_val = np.random.random_sample(scale_shape).astype(np.float32) From 7e0d21de6d3352c1238d35d2586f40e48b6da27f Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 30 Jan 2018 11:11:04 +0800 Subject: [PATCH 06/10] fix scale and bias dim --- paddle/operators/layer_norm_op.cc | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index 07ca8ac222..125ac9f53f 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -123,8 +123,8 @@ class LayerNormKernel int right = static_cast(matrix_dim[1]); auto input_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); - auto scale_map = ConstEigenMatrixMapRowMajor(scale->data(), left, 1); - auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), left, 1); + auto scale_map = ConstEigenMatrixMapRowMajor(scale->data(), 1, right); + auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); auto mean_map = EigenMatrixMapRowMajor(mean->data(), left, 1); auto var_map = EigenMatrixMapRowMajor(var->data(), left, 1); auto output_map = EigenMatrixMapRowMajor(output->data(), left, right); @@ -143,11 +143,11 @@ class LayerNormKernel // TODO(zcd): Some thinking about output_map, is it appropriate that // `output_map` and `input_map` point to the same memory. - auto inv_std_scale = - var_map.unaryExpr(inv_std_func).cwiseProduct(scale_map); - output_map = - inv_std_scale.replicate(1, right).cwiseProduct(input_map) + - (bias_map - inv_std_scale.cwiseProduct(mean_map)).replicate(1, right); + auto inv_std_scale = var_map.unaryExpr(inv_std_func); + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std_scale.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)) - + bias_map.replicate(left, 1); } }; @@ -221,7 +221,7 @@ class LayerNormGradKernel auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - auto scale_map = ConstEigenMatrixMapRowMajor(scale->data(), left, 1); + auto scale_map = ConstEigenMatrixMapRowMajor(scale->data(), 1, right); auto x_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); auto d_y_map = ConstEigenMatrixMapRowMajor(d_y->data(), left, right); auto mean_map = ConstEigenMatrixMapRowMajor(mean->data(), left, 1); @@ -229,12 +229,13 @@ class LayerNormGradKernel if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); - auto d_bias_map = EigenMatrixMapRowMajor(d_bias->data(), left, 1); + auto d_bias_map = EigenMatrixMapRowMajor(d_bias->data(), 1, right); d_bias_map = d_y_map.colwise().mean(); } if (d_scale) { d_scale->mutable_data(ctx.GetPlace()); - auto d_scale_map = EigenMatrixMapRowMajor(d_scale->data(), left, 1); + auto d_scale_map = + EigenMatrixMapRowMajor(d_scale->data(), 1, right); auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; // There are two equation to compute d_scale. 
One uses "Y" and the other // does not use "Y" @@ -254,15 +255,15 @@ class LayerNormGradKernel auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; // dy_dx auto dx_end = var_map.unaryExpr(inv_std_func) - .cwiseProduct(scale_map) .replicate(1, right) - .cwiseProduct(d_y_map); + .cwiseProduct(d_y_map) + .cwiseProduct(scale_map.replicate(left, 1)); // dy_dmean_dx auto dx_mean = (T(-1.0) / right) * var_map.unaryExpr(inv_std_func) - .cwiseProduct(scale_map) .replicate(1, right) .cwiseProduct(d_y_map) + .cwiseProduct(scale_map.replicate(left, 1)) .rowwise() .sum() .replicate(1, right); @@ -274,8 +275,8 @@ class LayerNormGradKernel auto dvar_end = var_map.unaryExpr(inv_std_func) .unaryExpr(triple_product_func) .cwiseProduct(dvar_end_part) - .cwiseProduct(scale_map) - .replicate(1, right); + .replicate(1, right) + .cwiseProduct(scale_map.replicate(left, 1)); auto dx_var = (T(-1.0) / right) * (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); From 09570b48dd40a52009b66e93af6108cb308e361d Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 30 Jan 2018 15:22:52 +0800 Subject: [PATCH 07/10] layer norm -> scale + bias --- paddle/operators/layer_norm_op.cc | 19 ++++++------- .../v2/fluid/tests/test_layer_norm_op.py | 27 ++++++++++--------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index 125ac9f53f..5821afe9f6 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -45,11 +45,12 @@ class LayerNormOp : public framework::OperatorWithKernel { auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis); int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], left); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], left); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right); ctx->SetOutputDim("Y", ctx->GetInputDim("X")); ctx->SetOutputDim("Mean", {left}); @@ -143,10 +144,10 @@ class LayerNormKernel // TODO(zcd): Some thinking about output_map, is it appropriate that // `output_map` and `input_map` point to the same memory. 
- auto inv_std_scale = var_map.unaryExpr(inv_std_func); + auto inv_std = var_map.unaryExpr(inv_std_func); output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std_scale.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)) - + .cwiseProduct(inv_std.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)) + bias_map.replicate(left, 1); } }; @@ -230,7 +231,7 @@ class LayerNormGradKernel if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); auto d_bias_map = EigenMatrixMapRowMajor(d_bias->data(), 1, right); - d_bias_map = d_y_map.colwise().mean(); + d_bias_map = d_y_map.colwise().sum(); } if (d_scale) { d_scale->mutable_data(ctx.GetPlace()); @@ -245,7 +246,7 @@ class LayerNormGradKernel var_map.unaryExpr(inv_std_func).replicate(1, right)) .cwiseProduct(d_y_map)) .colwise() - .mean(); + .sum(); } if (d_x) { @@ -269,14 +270,14 @@ class LayerNormGradKernel .replicate(1, right); // dy_var_dx auto dvar_end_part = (x_map - mean_map.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)) .cwiseProduct(d_y_map) .rowwise() .sum(); auto dvar_end = var_map.unaryExpr(inv_std_func) .unaryExpr(triple_product_func) .cwiseProduct(dvar_end_part) - .replicate(1, right) - .cwiseProduct(scale_map.replicate(left, 1)); + .replicate(1, right); auto dx_var = (T(-1.0) / right) * (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py index 9264cf4b79..d27d1d8138 100644 --- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -49,35 +49,38 @@ def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1): x_shape = x.shape + scale_shape = scale.shape N = reduce(mul, x_shape[0:begin_norm_axis], 1) D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) grad_y.shape = [N, D] x.shape = [N, D] mean.shape = [N, 1] var.shape = [N, 1] + scale.shape = [1, D] - d_scale = np.sum(grad_y, axis=1).reshape([1, D]) - d_bias = scale.reshape([1, D]) * np.sum(( - (x - mean) * np.sqrt(1 / var)) * grad_y, - axis=1).reshape([1, D]) + d_bias = np.sum(grad_y, axis=0).reshape([1, D]) + d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y, + axis=0).reshape([1, D]) - dx_end = np.sqrt(1.0 / var) * grad_y + dx_end = scale * np.sqrt(1.0 / var) * grad_y - d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y, axis=1).reshape([N, 1]) + d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape( + [N, 1]) # d_mean_1 = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape( # [N, 1]) * (-1.0 / D * np.sqrt(1.0 / var) * # np.sum(x - mean, axis=1).reshape([N, 1])).reshape([N, 1]) - d_mean = 1.0 / D * (d_mean_0) + d_mean = 1.0 / D * d_mean_0 - d_std = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape([N, 1]) * ( - 1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) + d_std = np.sum( + -1.0 / var * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * ( + 1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) - grad_x = scale.reshape([1, D]) * (dx_end + d_mean + d_std) + grad_x = dx_end + d_mean + d_std grad_y.shape = x_shape x.shape = x_shape - - return grad_x, d_bias, d_scale + scale.shape = scale_shape + return grad_x, d_scale, d_bias def create_or_get_tensor(scope, var_name, var, place): From 263e01970d4f1923a5ee92e8d9b615a529bfb29e Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 30 Jan 
2018 19:43:26 +0800 Subject: [PATCH 08/10] follow comments --- paddle/operators/layer_norm_op.cc | 197 ++++++++++++++++++++---------- 1 file changed, 133 insertions(+), 64 deletions(-) diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index 5821afe9f6..1c6d2ae4d0 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -33,29 +33,35 @@ class LayerNormOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), ""); - PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); - PADDLE_ENFORCE(ctx->HasInput("Bias"), ""); - PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), + "Output(Y) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Mean"), + "Output(Mean) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Variance"), + "Output(Variance) of LayerNormOp should not be null."); auto x_dim = ctx->GetInputDim("X"); auto begin_norm_axis = ctx->Attrs().Get("begin_norm_axis"); PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(), - "'begin_norm_axis' must be less than the rank of X"); + "'begin_norm_axis' must be less than the rank of X."); auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis); int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); - - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right); + if (ctx->HasInput("Scale")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right); + } + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right); + } ctx->SetOutputDim("Y", ctx->GetInputDim("X")); ctx->SetOutputDim("Mean", {left}); ctx->SetOutputDim("Variance", {left}); - ctx->ShareLoD("X", "Y"); } }; @@ -64,18 +70,26 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { public: LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input tensor"); + AddInput("X", "(LoDTensor) The input tensor."); AddInput("Scale", - "Scale is a 1-dimensional tensor of size H " - "that is applied to the output"); + "(Tensor, optional) Scale is a 1-dimensional tensor of size " + "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." + "It is applied to the output.") + .AsDispensable(); AddInput("Bias", - "Bias is a 1-dimensional tensor of size H " - "that is applied to the output"); - AddOutput("Y", "result after normalization"); - AddOutput("Mean", "Mean of the current mini batch."); - AddOutput("Variance", "Variance of the current mini batch."); - - AddAttr("epsilon", "") + "(Tensor, optional) Bias is a 1-dimensional tensor of size " + "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." 
+ "It is applied to the output.") + .AsDispensable(); + AddOutput("Y", "(LoDTensor) Result after normalization."); + AddOutput("Mean", "(Tensor) Mean of the current mini batch.") + .AsIntermediate(); + AddOutput("Variance", "(Tensor) Variance of the current mini batch.") + .AsIntermediate(); + + AddAttr("epsilon", + "(float, default 1e-5) Constant for " + "numerical stability") .SetDefault(1e-5) .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, @@ -83,7 +97,9 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr("begin_norm_axis", "(int default:1), the " - "axis of `begin_norm_axis ... Rank(X) - 1` will be normalized") + "axis of `begin_norm_axis ... Rank(X) - 1` will be " + "normalized. `begin_norm_axis` splits the tensor(`X`) to a " + "matrix [N,H].") .SetDefault(1) .AddCustomChecker([](const int &begin_norm_axis) { PADDLE_ENFORCE_GT(begin_norm_axis, 0, @@ -124,8 +140,7 @@ class LayerNormKernel int right = static_cast(matrix_dim[1]); auto input_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); - auto scale_map = ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); + auto mean_map = EigenMatrixMapRowMajor(mean->data(), left, 1); auto var_map = EigenMatrixMapRowMajor(var->data(), left, 1); auto output_map = EigenMatrixMapRowMajor(output->data(), left, right); @@ -141,14 +156,32 @@ class LayerNormKernel .unaryExpr(add_epslion); auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // TODO(zcd): Some thinking about output_map, is it appropriate that // `output_map` and `input_map` point to the same memory. auto inv_std = var_map.unaryExpr(inv_std_func); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)) + - bias_map.replicate(left, 1); + if (scale && bias) { + auto scale_map = + ConstEigenMatrixMapRowMajor(scale->data(), 1, right); + auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)) + + bias_map.replicate(left, 1); + } else if (scale) { + auto scale_map = + ConstEigenMatrixMapRowMajor(scale->data(), 1, right); + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)); + } else if (bias) { + auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)) + + bias_map.replicate(left, 1); + } else { + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)); + } } }; @@ -158,11 +191,16 @@ class LayerNormGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { // check input - PADDLE_ENFORCE(ctx->HasInput("X")); - PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); - PADDLE_ENFORCE(ctx->HasInput("Mean"), ""); - PADDLE_ENFORCE(ctx->HasInput("Variance"), ""); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), ""); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mean"), + "Input(Mean) of LayerNormOp should not 
be null."); + PADDLE_ENFORCE(ctx->HasInput("Variance"), + "Input(Variance) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) of LayerNormOp should not be null."); // check output if (ctx->HasOutput(framework::GradVarName("X"))) { @@ -222,7 +260,6 @@ class LayerNormGradKernel auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - auto scale_map = ConstEigenMatrixMapRowMajor(scale->data(), 1, right); auto x_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); auto d_y_map = ConstEigenMatrixMapRowMajor(d_y->data(), left, right); auto mean_map = ConstEigenMatrixMapRowMajor(mean->data(), left, 1); @@ -254,35 +291,67 @@ class LayerNormGradKernel auto d_x_map = EigenMatrixMapRowMajor(d_x->data(), left, right); auto triple_product_func = [](T ele) { return ele * ele * ele; }; auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // dy_dx - auto dx_end = var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .cwiseProduct(scale_map.replicate(left, 1)); - // dy_dmean_dx - auto dx_mean = (T(-1.0) / right) * - var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .cwiseProduct(scale_map.replicate(left, 1)) - .rowwise() - .sum() - .replicate(1, right); - // dy_var_dx - auto dvar_end_part = (x_map - mean_map.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)) - .cwiseProduct(d_y_map) - .rowwise() - .sum(); - auto dvar_end = var_map.unaryExpr(inv_std_func) - .unaryExpr(triple_product_func) - .cwiseProduct(dvar_end_part) - .replicate(1, right); - auto dx_var = - (T(-1.0) / right) * - (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); - - d_x_map = dx_end + dx_mean + dx_var; + // TODO(zcd): these code can be refined + if (d_scale) { + auto scale_map = + ConstEigenMatrixMapRowMajor(scale->data(), 1, right); + // dy_dx + auto dx_end = var_map.unaryExpr(inv_std_func) + .replicate(1, right) + .cwiseProduct(d_y_map) + .cwiseProduct(scale_map.replicate(left, 1)); + // dy_dmean_dx + auto dx_mean = (T(-1.0) / right) * + var_map.unaryExpr(inv_std_func) + .replicate(1, right) + .cwiseProduct(d_y_map) + .cwiseProduct(scale_map.replicate(left, 1)) + .rowwise() + .sum() + .replicate(1, right); + // dy_var_dx + auto dvar_end_part = (x_map - mean_map.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)) + .cwiseProduct(d_y_map) + .rowwise() + .sum(); + auto dvar_end = var_map.unaryExpr(inv_std_func) + .unaryExpr(triple_product_func) + .cwiseProduct(dvar_end_part) + .replicate(1, right); + auto dx_var = + (T(-1.0) / right) * + (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); + + d_x_map = dx_end + dx_mean + dx_var; + } else { + // dy_dx + auto dx_end = var_map.unaryExpr(inv_std_func) + .replicate(1, right) + .cwiseProduct(d_y_map); + // dy_dmean_dx + auto dx_mean = (T(-1.0) / right) * + var_map.unaryExpr(inv_std_func) + .replicate(1, right) + .cwiseProduct(d_y_map) + .rowwise() + .sum() + .replicate(1, right); + // dy_var_dx + auto dvar_end_part = (x_map - mean_map.replicate(1, right)) + .cwiseProduct(d_y_map) + .rowwise() + .sum(); + auto dvar_end = var_map.unaryExpr(inv_std_func) + .unaryExpr(triple_product_func) + .cwiseProduct(dvar_end_part) + .replicate(1, right); + auto dx_var = + (T(-1.0) / right) * + (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); + + d_x_map = dx_end + dx_mean + dx_var; + } } } }; From acb907878a0e3a66f56371aa82088233f6dc9aaf 
From acb907878a0e3a66f56371aa82088233f6dc9aaf Mon Sep 17 00:00:00 2001
From: chengduoZH
Date: Tue, 30 Jan 2018 20:33:54 +0800
Subject: [PATCH 09/10] refine unit test

---
 .../v2/fluid/tests/test_layer_norm_op.py | 118 +++++++++---------
 1 file changed, 59 insertions(+), 59 deletions(-)

diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
index d27d1d8138..ac94dfb92a 100644
--- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
@@ -21,29 +21,19 @@ from paddle.v2.fluid.op import Operator
 from paddle.v2.fluid.framework import grad_var_name
 
 
-def get_backward_op(scope, op, no_grad_set):
-    backward_op = core.Operator.backward(op, no_grad_set)
-    for input in backward_op.input_vars():
-        var = scope.var(input)
-        var.get_tensor()
-    for output in backward_op.output_vars():
-        var = scope.var(output)
-        var.get_tensor()
-    return backward_op
-
-
 def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
-    old_shape = x.shape
-    N = reduce(mul, old_shape[0:begin_norm_axis], 1)
-    D = reduce(mul, old_shape[begin_norm_axis:len(old_shape)], 1)
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
     x.shape = [N, D]
+
     mean = np.mean(x, axis=1)
    var = np.var(x, axis=1) + epsilon
     output = scale.reshape([1, D]) * np.divide(
         (x - mean.reshape([N, 1])),
         (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
-    output.shape = old_shape
-    x.shape = old_shape
+
+    x.shape, output.shape = x_shape, x_shape
     return output, mean, var
 
 
@@ -52,27 +42,25 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     scale_shape = scale.shape
     N = reduce(mul, x_shape[0:begin_norm_axis], 1)
     D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
-    grad_y.shape = [N, D]
-    x.shape = [N, D]
-    mean.shape = [N, 1]
-    var.shape = [N, 1]
+    x.shape, grad_y.shape = [N, D], [N, D]
+    var.shape, mean.shape = [N, 1], [N, 1]
     scale.shape = [1, D]
 
+    # d_bias
     d_bias = np.sum(grad_y, axis=0).reshape([1, D])
+    # d_scale
     d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y,
                      axis=0).reshape([1, D])
-
+    # dx
     dx_end = scale * np.sqrt(1.0 / var) * grad_y
-
     d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
         [N, 1])
     # d_mean_1 = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape(
     #     [N, 1]) * (-1.0 / D * np.sqrt(1.0 / var) *
     #     np.sum(x - mean, axis=1).reshape([N, 1])).reshape([N, 1])
     d_mean = 1.0 / D * d_mean_0
-
     d_std = np.sum(
-        -1.0 / var * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
+        -(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
             1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
 
     grad_x = dx_end + d_mean + d_std
@@ -83,6 +71,17 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     return grad_x, d_scale, d_bias
 
 
+def get_backward_op(scope, op, no_grad_set):
+    backward_op = core.Operator.backward(op, no_grad_set)
+    for input in backward_op.input_vars():
+        var = scope.var(input)
+        var.get_tensor()
+    for output in backward_op.output_vars():
+        var = scope.var(output)
+        var.get_tensor()
+    return backward_op
+
+
 def create_or_get_tensor(scope, var_name, var, place):
     tensor = scope.var(var_name).get_tensor()
     if var is not None:
@@ -145,8 +144,9 @@ class TestLayerNormdOp(OpTest):
 
         self.assertLessEqual(max_diff, max_relative_error, err_msg())
 
-    def test_forward_backward(self):
+    def check_forward_backward(self, shape, begin_norm_axis):
         def test_with_place(place, shape, begin_norm_axis=1):
+            # setUp
             assert begin_norm_axis > 0 and begin_norm_axis < len(
                 shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
             # attr
@@ -158,30 +158,35 @@ class TestLayerNormdOp(OpTest):
             x_val = np.random.random_sample(x_shape).astype(np.float32)
             scale_val = np.random.random_sample(scale_shape).astype(np.float32)
             bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            y_grad = np.random.random_sample(x_shape).astype(np.float32)
 
             # run forward
             y_out, saved_mean, var_ref = _reference_layer_norm_naive(
                 x_val, scale_val, bias_val, epsilon, begin_norm_axis)
+            naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
 
-            # for gradient test
-            y_grad = np.random.random_sample(x_shape).astype(np.float32)
-
+            # get gradient
             x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
                 x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
+            naive_grad = {
+                "X": x_grad_ref,
+                "Scale": scale_grad_ref,
+                "Bias": bias_grad_ref
+            }
 
             scope = core.Scope()
 
             # create input
-            x_tensor = create_or_get_tensor(scope, "X", x_val, place)
-            scale_tensor = create_or_get_tensor(scope, "Scale", scale_val,
-                                                place)
-            bias_tensor = create_or_get_tensor(scope, "Bias", bias_val, place)
+            input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
+            for i_name in input_map:
+                create_or_get_tensor(scope, i_name, input_map[i_name], place)
 
             # create output
-            y_tensor = create_or_get_tensor(scope, "Y", None, place)
-            mean_tensor = create_or_get_tensor(scope, "Mean", None, place)
-            variance_tensor = create_or_get_tensor(scope, "Variance", None,
-                                                   place)
+            output_map = {"Y": None, "Mean": None, "Variance": None}
+            output_tensor = {}
+            for o_name in output_map:
+                output_tensor[o_name] = create_or_get_tensor(
+                    scope, o_name, output_map[o_name], place)
 
             layer_norm_op = Operator(
                 "layer_norm",
@@ -200,13 +205,10 @@ class TestLayerNormdOp(OpTest):
             layer_norm_op.run(scope, place)
 
             # check forward result
-            if isinstance(place, core.CUDAPlace):
-                atol = 5e-2
-            else:
-                atol = 1e-4
-            self.__assert_close(y_tensor, y_out, "Y", atol)
-            self.__assert_close(mean_tensor, saved_mean, "Mean", atol)
-            self.__assert_close(variance_tensor, var_ref, "Variance", atol)
+            atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
+            for o_tensor in output_tensor:
+                self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
+                                    o_tensor, atol)
 
             # run backward
             layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
@@ -216,30 +218,28 @@ class TestLayerNormdOp(OpTest):
                 feed_dict={"Y": y_grad})
             layer_norm_op_grad.run(scope, place)
 
-            x_grad_tensor = create_or_get_tensor(scope,
-                                                 grad_var_name("X"), None,
-                                                 place)
-            scale_grad_tensor = create_or_get_tensor(scope,
-                                                     grad_var_name("Scale"),
-                                                     None, place)
-            bias_grad_tensor = create_or_get_tensor(scope,
-                                                    grad_var_name("Bias"), None,
-                                                    place)
+            # get output
+            grad_tensor = {}
+            for o_name in naive_grad:
+                grad_tensor[o_name] = create_or_get_tensor(
+                    scope, grad_var_name(o_name), None, place)
 
             # check gradient output
-            self.__assert_grad_close(x_grad_tensor, x_grad_ref, "x_grad", place)
-            self.__assert_grad_close(scale_grad_tensor, scale_grad_ref,
-                                     "scale_grad", place)
-            self.__assert_grad_close(bias_grad_tensor, bias_grad_ref,
-                                     "bias_grad", place)
+            for o_grad in naive_grad:
+                self.__assert_grad_close(grad_tensor[o_grad],
+                                         naive_grad[o_grad], o_grad + "@GRAD",
+                                         place)
 
         places = [core.CPUPlace()]
         if core.is_compile_gpu() and core.op_support_gpu("layer_norm"):
             places.append(core.CUDAPlace(0))
 
         for place in places:
-            test_with_place(place, [2, 3, 4, 5], begin_norm_axis=1)
-            test_with_place(place, [2, 3, 4, 5], begin_norm_axis=3)
+            test_with_place(place, shape, begin_norm_axis)
+
+    def test_check_forward_backward(self):
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
 
 
 if __name__ == '__main__':

From e5058ed1f14b8fe16be8055bb819e0a101cf2ade Mon Sep 17 00:00:00 2001
From: chengduoZH
Date: Tue, 30 Jan 2018 22:13:06 +0800
Subject: [PATCH 10/10] Add unit test for with_scale and with_bias

---
 .../paddle/v2/fluid/tests/test_layer_norm_op.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
index ac94dfb92a..7d5dc7d1a6 100644
--- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
@@ -54,10 +54,7 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     # dx
     dx_end = scale * np.sqrt(1.0 / var) * grad_y
     d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
-        [N, 1])
-    # d_mean_1 = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape(
-    #     [N, 1]) * (-1.0 / D * np.sqrt(1.0 / var) *
-    #     np.sum(x - mean, axis=1).reshape([N, 1])).reshape([N, 1])
+        [N, 1])  # the second part equals zero
     d_mean = 1.0 / D * d_mean_0
     d_std = np.sum(
         -(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
             1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
@@ -237,10 +234,19 @@ class TestLayerNormdOp(OpTest):
         for place in places:
             test_with_place(place, shape, begin_norm_axis)
 
-    def test_check_forward_backward(self):
+    def test_check_forward_backward_with_scale_and_bias(self):
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
 
+    def test_check_forward_backward_with_scale(self):
+        pass  # TODO(zcd)
+
+    def test_check_forward_backward_with_bias(self):
+        pass  # TODO(zcd)
+
+    def test_check_forward_backward(self):
+        pass  # TODO(zcd)
+
 
 if __name__ == '__main__':
     unittest.main()
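
The three new tests above are placeholders (pass # TODO(zcd)). Since the forward kernel treats a missing Scale like a scale of ones and a missing Bias like a bias of zeros, one possible, purely illustrative way to back them with references is sketched below; it reuses _reference_layer_norm_naive from this test file, and the helper names are hypothetical, not part of the patch.

import numpy as np


def _reference_without_bias(x, scale, epsilon, begin_norm_axis=1):
    # Absent Bias behaves like a zero bias of size D.
    D = int(np.prod(x.shape[begin_norm_axis:]))
    zeros = np.zeros(D, dtype=x.dtype)
    return _reference_layer_norm_naive(x, scale, zeros, epsilon, begin_norm_axis)


def _reference_without_scale(x, beta, epsilon, begin_norm_axis=1):
    # Absent Scale behaves like a scale of ones of size D.
    D = int(np.prod(x.shape[begin_norm_axis:]))
    ones = np.ones(D, dtype=x.dtype)
    return _reference_layer_norm_naive(x, ones, beta, epsilon, begin_norm_axis)
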