[NGraph] Enable ngraph layer_norm operator (#17599)

* Enable ngraph layer_norm operator

test=develop

* Disable/Enable cuda, new unit-test test=develop

* Fix use_cudnn test=develop

* Fixed test_layer test, new function is added test=develop

* Set use_cudnn by default test=develop
pawelpiotrowicz 6 years ago committed by tensor-tang
parent 993c703bcc
commit 39bc8a55a4

@@ -0,0 +1,195 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <memory>
#include <numeric>  // std::iota, used by the builders below
#include <string>
#include <unordered_map>
#include <vector>

#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"

namespace paddle {
namespace operators {
namespace ngraphs {
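
// Helper builders: the per-row statistics (mean/variance) are reshaped to the
// leading [0, begin_norm_axis) dims and broadcast back over the normalized
// trailing dims, while the flattened Scale/Bias are reshaped to the trailing
// dims and broadcast over the leading ones.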
std::shared_ptr<ngraph::Node> reshape_reduction(
    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
    int begin_norm_axis) {
  ngraph::Shape keepdims_shape(shape.begin(), shape.begin() + begin_norm_axis);
  return paddle::platform::NgReshaper(node, keepdims_shape);
}

std::shared_ptr<ngraph::Node> broadcast_reduction(
    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
    int begin_norm_axis) {
  ngraph::AxisSet axis_set;
  for (size_t i = begin_norm_axis; i < shape.size(); ++i) axis_set.insert(i);
  auto reshape = reshape_reduction(node, shape, begin_norm_axis);
  return std::make_shared<ngraph::op::Broadcast>(reshape, shape, axis_set);
}

std::shared_ptr<ngraph::Node> reshape_bias_scale(
    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
    int begin_norm_axis) {
  ngraph::Shape keepdims_shape(shape.begin() + begin_norm_axis, shape.end());
  return paddle::platform::NgReshaper(node, keepdims_shape);
}

std::shared_ptr<ngraph::Node> broadcast_bias_scale(
    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
    int begin_norm_axis) {
  auto reshape = reshape_bias_scale(node, shape, begin_norm_axis);
  ngraph::AxisSet axis_set;
  for (int i = 0; i < begin_norm_axis; ++i) axis_set.insert(i);
  return std::make_shared<ngraph::op::Broadcast>(reshape, shape, axis_set);
}

std::shared_ptr<ngraph::Node> flatten(const std::shared_ptr<ngraph::Node>& node,
                                      bool insert_leading_one = false) {
  size_t out = 1;
  for (auto s : node->get_shape()) out *= s;
  if (insert_leading_one) {
    return paddle::platform::NgReshaper(node, ngraph::Shape{1, out});
  } else {
    return paddle::platform::NgReshaper(node, ngraph::Shape{out});
  }
}
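
// Forward: Y = Scale * (X - mean(X)) / sqrt(var(X) + epsilon) + Bias, with the
// statistics taken over the dims at and after begin_norm_axis; Mean/Variance
// are emitted flattened, as the Paddle layer_norm op expects.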
static void BuildLayerNormNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
  const auto begin_norm_axis = op_attrs.Get<int>("begin_norm_axis");
  const auto epsilon = op_attrs.Get<float>("epsilon");

  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
  auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);

  auto shape = x->get_shape();
  std::vector<size_t> reduction_axes(shape.size() - begin_norm_axis);
  std::iota(reduction_axes.begin(), reduction_axes.end(), begin_norm_axis);

  auto mean = ngraph::builder::mean(x, reduction_axes);
  auto broadcast_mean = broadcast_reduction(mean, shape, begin_norm_axis);

  auto delta = x - broadcast_mean;
  auto variance = ngraph::builder::mean(delta * delta, reduction_axes);

  auto eps = paddle::platform::CreateConstant(variance->get_element_type(),
                                              variance->get_shape(), {epsilon});
  auto stddev = std::make_shared<ngraph::op::Sqrt>(variance + eps);
  auto broadcast_stddev = broadcast_reduction(stddev, shape, begin_norm_axis);

  auto norm = delta / broadcast_stddev;

  if (scale) {
    auto broadcast_scale = broadcast_bias_scale(scale, shape, begin_norm_axis);
    norm = norm * broadcast_scale;
  }
  if (bias) {
    auto broadcast_bias = broadcast_bias_scale(bias, shape, begin_norm_axis);
    norm = norm + broadcast_bias;
  }

  mean = flatten(mean);
  variance = flatten(variance);

  paddle::platform::SetOutputNode(op, "Y", norm, ngb_node_map);
  paddle::platform::SetOutputNode(op, "Mean", mean, ngb_node_map);
  paddle::platform::SetOutputNode(op, "Variance", variance, ngb_node_map);
}
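
// Backward (with norm = (X - mean) / stddev and dy' = dY * Scale / stddev):
//   dBias  = sum(dY)        over the leading dims,
//   dScale = sum(dY * norm) over the leading dims,
//   dX     = dy' - mean(dy') - norm * mean(dy' * norm),
// where the means are taken over the normalized dims and broadcast back.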
static void BuildLayerNormGradNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
  const auto begin_norm_axis = op_attrs.Get<int>("begin_norm_axis");
  const auto epsilon = op_attrs.Get<float>("epsilon");

  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
  auto mean = paddle::platform::GetInputNode(op, "Mean", ngb_node_map);
  auto variance = paddle::platform::GetInputNode(op, "Variance", ngb_node_map);
  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
  auto dy = paddle::platform::GetInputNode(op, framework::GradVarName("Y"),
                                           ngb_node_map);

  auto dx = paddle::platform::GetOutputNode(op, framework::GradVarName("X"),
                                            ngb_node_map);
  auto dscale = paddle::platform::GetOutputNode(
      op, framework::GradVarName("Scale"), ngb_node_map);
  auto dbias = paddle::platform::GetOutputNode(
      op, framework::GradVarName("Bias"), ngb_node_map);

  auto shape = x->get_shape();

  auto broadcast_mean = broadcast_reduction(mean, shape, begin_norm_axis);
  auto delta = x - broadcast_mean;

  auto eps = paddle::platform::CreateConstant(variance->get_element_type(),
                                              variance->get_shape(), {epsilon});
  auto stddev = std::make_shared<ngraph::op::Sqrt>(variance + eps);
  auto broadcast_stddev = broadcast_reduction(stddev, shape, begin_norm_axis);

  auto norm = delta / broadcast_stddev;

  if (dbias) {
    std::vector<size_t> reduction_axes(begin_norm_axis);
    std::iota(reduction_axes.begin(), reduction_axes.end(), 0);
    auto sum_dy = std::make_shared<ngraph::op::Sum>(dy, reduction_axes);
    paddle::platform::SetOutputNode(op, framework::GradVarName("Bias"),
                                    flatten(sum_dy), ngb_node_map);
  }

  if (dscale) {
    std::vector<size_t> reduction_axes(begin_norm_axis);
    std::iota(reduction_axes.begin(), reduction_axes.end(), 0);
    auto sum_dy = std::make_shared<ngraph::op::Sum>(dy * norm, reduction_axes);
    paddle::platform::SetOutputNode(op, framework::GradVarName("Scale"),
                                    flatten(sum_dy), ngb_node_map);
  }

  if (dx) {
    std::shared_ptr<ngraph::Node> dx_end = dy / broadcast_stddev;
    if (dscale)
      dx_end = dx_end * broadcast_bias_scale(scale, shape, begin_norm_axis);

    std::vector<size_t> reduction_axes(shape.size() - begin_norm_axis);
    std::iota(reduction_axes.begin(), reduction_axes.end(), begin_norm_axis);

    auto dx_mean = broadcast_reduction(
        ngraph::builder::mean(-dx_end, reduction_axes), shape, begin_norm_axis);
    auto dx_std =
        norm * broadcast_reduction(
                   ngraph::builder::mean(-dx_end * norm, reduction_axes), shape,
                   begin_norm_axis);

    paddle::platform::SetOutputNode(op, framework::GradVarName("X"),
                                    dx_end + dx_mean + dx_std, ngb_node_map);
  }
}
REGISTER_NG_OP(layer_norm, BuildLayerNormNode);
REGISTER_NG_OP(layer_norm_grad, BuildLayerNormGradNode);
} // namespace ngraphs
} // namespace operators
} // namespace paddle
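
For orientation, a minimal NumPy sketch of the forward computation the builder above emits; the function name, argument layout, and the reshape of the flattened Scale/Bias are illustrative assumptions, not part of the patch:

import numpy as np

def layer_norm_reference(x, scale, bias, begin_norm_axis=1, epsilon=1e-5):
    # reduce over the normalized (trailing) dims, as BuildLayerNormNode does
    axes = tuple(range(begin_norm_axis, x.ndim))
    mean = x.mean(axis=axes, keepdims=True)
    var = ((x - mean) ** 2).mean(axis=axes, keepdims=True)
    norm = (x - mean) / np.sqrt(var + epsilon)
    tail = x.shape[begin_norm_axis:]
    y = norm * scale.reshape(tail) + bias.reshape(tail)
    # Mean/Variance outputs are flattened, mirroring flatten() above
    return y, mean.ravel(), var.ravel()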

@@ -0,0 +1,30 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest, sys
sys.path.append("../")
from test_layer_norm_op import TestLayerNormdOp


class TestLayerNormNGRAPHOp(TestLayerNormdOp):
    def setUp(self):
        super(TestLayerNormNGRAPHOp, self).setUp()
        self.use_cudnn = False


del TestLayerNormdOp

if __name__ == "__main__":
    unittest.main()
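
The subclass only flips use_cudnn off and reuses the numerical checks from TestLayerNormdOp. Whether the ngraph kernel is actually exercised depends on the build; presumably the fluid ngraph engine also has to be switched on before the test process starts, e.g. via an environment flag (the flag name below is an assumption, not introduced by this patch):

import os
os.environ["FLAGS_use_ngraph"] = "true"  # assumed flag for the fluid ngraph engine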

@@ -72,6 +72,9 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
 class TestLayerNormdOp(unittest.TestCase):
+    def setUp(self):
+        self.use_cudnn = True
+
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
@@ -160,7 +163,8 @@ class TestLayerNormdOp(unittest.TestCase):
             self.__assert_close(bias_grad, out[5], "bias_grad")

         places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "layer_norm") and self.use_cudnn:
             places.append(core.CUDAPlace(0))
         for place in places:
