Enhance fused_elementwise_activation_op (#12837)

* Enhance the function of fused_elementwise_activation_op * enhance unit test * Clean Code And Add Doc * Add compound functors * Fix doc and enhance unit test * define Dx and Dy for d_binary_func * add mul_scale * add mul_scale * add elementwise_mul * code refine * code refine * add doc * add AsIntermediate
7 years ago · 3bd1d22a7d
parent a615ad46e4
commit 3bd1d22a7d
7 changed files with 1897 additions and 1257 deletions
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
--- a/paddle/fluid/operators/fused_elemwise_activation_op.cc
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc
--- a/paddle/fluid/operators/fused_elemwise_activation_op.h
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.h
--- a/paddle/fluid/operators/math/compound_functors.h
+++ b/paddle/fluid/operators/math/compound_functors.h
@ -0,0 +1,185 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T, typename BinaryFunctor, typename UnaryFunctor>
+struct BinaryCompoundFunctor {
+  BinaryCompoundFunctor(const BinaryFunctor func1, const UnaryFunctor func2)
+      : func1_(func1), func2_(func2) {}
+  // Z = BinaryFunctor(X, UnaryFunctor(Y))
+  // GetOut computes Z directly as func1_(x, func2_(y)).
+  inline HOSTDEVICE T GetOut(T x, T y) { return func1_(x, func2_(y)); }
+  // Computes func1_(x, intermediat_out); presumably the caller passes func2_(y).
+  inline HOSTDEVICE T GetOutUseIntermediateOut(T x, T intermediat_out) {
+    return func1_(x, intermediat_out);
+  }
+  // Returns only the inner value func2_(y); x is not used.
+  inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return func2_(y); }
+
+  BinaryFunctor func1_;  // Outer functor, called with two arguments.
+  UnaryFunctor func2_;   // Inner functor, called with one argument.
+};
+
+template <typename T, typename UnaryFunctor, typename BinaryFunctor>
+struct UnaryCompoundFunctor {
+  UnaryCompoundFunctor(const UnaryFunctor func1, const BinaryFunctor func2)
+      : func1_(func1), func2_(func2) {}
+  // Z = UnaryFunctor(BinaryFunctor(X, Y))
+  // GetOut computes Z directly as func1_(func2_(x, y)).
+  inline HOSTDEVICE T GetOut(T x, T y) { return func1_(func2_(x, y)); }
+  // Computes func1_(intermediat_out); x is not used. Presumably the caller
+  inline HOSTDEVICE T GetOutUseIntermediateOut(T x, T intermediat_out) {
+    return func1_(intermediat_out);  // passes func2_(x, y) as intermediat_out.
+  }
+  // Returns only the inner value func2_(x, y).
+  inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return func2_(x, y); }
+
+  UnaryFunctor func1_;   // Outer functor, called with one argument.
+  BinaryFunctor func2_;  // Inner functor, called with two arguments.
+};
+
+// FIXME(zcd): DBinaryFun and DUnaryFun have two methods to get
+// the dx: one uses the 'out', and the other does not use it.
+// The former method saves the time of recomputing the
+// 'out', but it must occupy memory to store the 'out'.
+// The latter method avoids occupying this memory,
+// but it must recompute the 'out'.
+template <typename T, typename DBinaryFun, typename UnaryFun>
+struct BinaryCompoundGradDxFunctor {  // Scales dout by d_binary_fun_.Dx(...).
+  BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun,
+                              const UnaryFun &unary_fun)
+      : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {}
+  // Without intermediate_out: unary_fun_(y) is recomputed; `out` is not used.
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    return dout * d_binary_fun_.Dx(x, unary_fun_(y));
+  }
+  // With intermediate_out used in place of unary_fun_(y); `y`, `out` not used.
+  inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) {
+    return dout * d_binary_fun_.Dx(x, intermediate_out);
+  }
+
+ private:
+  DBinaryFun d_binary_fun_;  // Must provide Dx(a, b).
+  UnaryFun unary_fun_;       // Only called by the four-argument overload.
+};
+
+template <typename T, typename DBinaryFun, typename UnaryFun,
+          typename DUnaryFun>
+struct BinaryCompoundGradDyFunctor {  // dout * Dy(...) * d_unary_fun_(...).
+  BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun,
+                              const UnaryFun &unary_fun,
+                              const DUnaryFun &d_unary_fun)
+      : d_binary_fun_(d_binary_fun),
+        unary_fun_(unary_fun),
+        d_unary_fun_(d_unary_fun) {}
+  // Without intermediate_out: unary_fun_(y) is recomputed; `out` is not used.
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    return dout * d_binary_fun_.Dy(x, unary_fun_(y)) * d_unary_fun_(y);
+  }
+  // With intermediate_out: d_unary_fun_ is called with two arguments here.
+  inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) {
+    return dout * d_binary_fun_.Dy(x, intermediate_out) *
+           d_unary_fun_(y, intermediate_out);
+  }
+
+ private:
+  DBinaryFun d_binary_fun_;  // Must provide Dy(a, b).
+  UnaryFun unary_fun_;       // Only called by the four-argument overload.
+  DUnaryFun d_unary_fun_;    // Called as f(y) and as f(y, intermediate_out).
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation = true>
+struct UnaryCompoundGradDxFunctor {  // dout * d_unary_fun_(...) * Dx(x, y).
+  UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun,
+                             const BinaryFun &binary_fun,
+                             const DBinaryFun &d_binary_fun)
+      : d_unary_fun_(d_unary_fun),
+        binary_fun_(binary_fun),
+        d_binary_fun_(d_binary_fun) {}
+  // Without intermediate_out: binary_fun_(x, y) is evaluated in this call.
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    T base;
+    if (Recomputation) {  // Plain `if`: both d_unary_fun_ calls must compile.
+      base = dout * d_unary_fun_(binary_fun_(x, y));
+    } else {  // Recomputation == false: `out` is passed as a second argument.
+      base = dout * d_unary_fun_(binary_fun_(x, y), out);
+    }
+    return base * d_binary_fun_.Dx(x, y);
+  }
+  // With intermediate_out used in place of binary_fun_(x, y).
+  inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) {
+    T base;
+    if (Recomputation) {  // `out` is ignored on this branch.
+      base = dout * d_unary_fun_(intermediate_out);
+    } else {
+      base = dout * d_unary_fun_(intermediate_out, out);
+    }
+    return base * d_binary_fun_.Dx(x, y);
+  }
+
+ private:
+  DUnaryFun d_unary_fun_;    // Called with one or two arguments (see above).
+  BinaryFun binary_fun_;     // Only called by the four-argument overload.
+  DBinaryFun d_binary_fun_;  // Must provide Dx(a, b).
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation = true>
+struct UnaryCompoundGradDyFunctor {  // dout * d_unary_fun_(...) * Dy(x, y).
+  UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun,
+                             const BinaryFun &binary_fun,
+                             const DBinaryFun &d_binary_fun)
+      : d_unary_fun_(d_unary_fun),
+        binary_fun_(binary_fun),
+        d_binary_fun_(d_binary_fun) {}
+  // Without intermediate_out: binary_fun_(x, y) is evaluated in this call.
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    T base;
+    if (Recomputation) {  // Plain `if`: both d_unary_fun_ calls must compile.
+      base = dout * d_unary_fun_(binary_fun_(x, y));
+    } else {  // Recomputation == false: `out` is passed as a second argument.
+      base = dout * d_unary_fun_(binary_fun_(x, y), out);
+    }
+    return base * d_binary_fun_.Dy(x, y);
+  }
+  // With intermediate_out used in place of binary_fun_(x, y).
+  inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) {
+    T base;
+    if (Recomputation) {  // `out` is ignored on this branch.
+      base = dout * d_unary_fun_(intermediate_out);
+    } else {
+      base = dout * d_unary_fun_(intermediate_out, out);
+    }
+    return base * d_binary_fun_.Dy(x, y);
+  }
+
+ private:
+  DUnaryFun d_unary_fun_;    // Called with one or two arguments (see above).
+  BinaryFun binary_fun_;     // Only called by the four-argument overload.
+  DBinaryFun d_binary_fun_;  // Must provide Dy(a, b).
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/functors.h
+++ b/paddle/fluid/operators/math/functors.h
@ -18,6 +18,19 @@ namespace paddle {
 namespace operators {
 namespace math {

+// MulFunctor: binary functor returning the product of its two arguments.
+template <typename T>
+struct MulFunctor {
+  // out = x * y;
+  inline HOSTDEVICE T operator()(T x, T y) { return x * y; }
+};
+
+template <typename T>
+struct MulGradFunctor {  // Partial derivatives of out = x * y.
+  inline HOSTDEVICE T Dx(T x, T y) { return y; }  // d(x * y)/dx = y
+  inline HOSTDEVICE T Dy(T x, T y) { return x; }  // d(x * y)/dy = x
+};
+
 // AddFunctor
 template <typename T>
 struct AddFunctor {
@ -27,9 +40,8 @@ struct AddFunctor {

 template <typename T>
 struct AddGradFunctor {
-  inline HOSTDEVICE T operator()(T x, T y) { return 1; }
-
-  inline HOSTDEVICE T operator()(T x, T y, T out) const { return 1; }
+  inline HOSTDEVICE T Dx(T x, T y) { return 1; }
+  inline HOSTDEVICE T Dy(T x, T y) { return 1; }
 };

 template <typename T>
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@ -47,7 +47,8 @@ def get_numeric_gradient(place,
                         input_to_check,
                         output_names,
                         delta=0.005,
-                         in_place=False):
+                         in_place=False,
+                         sum_outputs=None):
    # FIXME: change this method by compile time concepts
    set_input(scope, op, inputs, place)

@ -58,9 +59,11 @@ def get_numeric_gradient(place,
        sum = []
        op.run(scope, place)
        for output_name in output_names:
+            if sum_outputs and output_name not in sum_outputs:
+                continue
            sum.append(
                np.array(scope.find_var(output_name).get_tensor()).mean())
-        return np.array(sum).mean()
+        return np.array(sum).sum() / len(output_names)

    tensor_to_check = scope.find_var(input_to_check).get_tensor()
    tensor_size = product(tensor_to_check.shape())
@ -396,13 +399,14 @@ class OpTest(unittest.TestCase):
                   numeric_grad_delta=0.005,
                   in_place=False,
                   max_relative_error=0.005,
-                   user_defined_grads=None):
+                   user_defined_grads=None,
+                   sum_outputs=None):
        places = self._get_places()
        for place in places:
            self.check_grad_with_place(place, inputs_to_check, output_names,
                                       no_grad_set, numeric_grad_delta,
                                       in_place, max_relative_error,
-                                       user_defined_grads)
+                                       user_defined_grads, sum_outputs)

    def check_grad_with_place(self,
                              place,
@ -412,7 +416,8 @@ class OpTest(unittest.TestCase):
                              numeric_grad_delta=0.005,
                              in_place=False,
                              max_relative_error=0.005,
-                              user_defined_grads=None):
+                              user_defined_grads=None,
+                              sum_outputs=None):
        self.scope = core.Scope()
        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
@ -435,7 +440,8 @@ class OpTest(unittest.TestCase):
                input_to_check,
                output_names,
                delta=numeric_grad_delta,
-                in_place=in_place) for input_to_check in inputs_to_check
+                in_place=in_place,
+                sum_outputs=sum_outputs) for input_to_check in inputs_to_check
        ]
        analytic_grads = self._get_gradient(inputs_to_check, place,
                                            output_names, no_grad_set)
--- a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py