extend elementwise broadcast function (#20957)

6 years ago · 0e7baabe59
parent d623e863c9
commit 0e7baabe59
21 changed files with 1074 additions and 503 deletions
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
@ -99,8 +99,8 @@ REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_add, Add);

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(
-    elementwise_add_grad, ops::ElementwiseOpExplicitGrad,
-    ops::ElementwiseGradOpInplace, ops::ElementwiseGradNoBufVarsInference,
+    elementwise_add_grad, ops::ElementwiseOpGrad, ops::ElementwiseGradOpInplace,
+    ops::ElementwiseGradNoBufVarsInference,
    ops::ElementwiseAddDoubleGradMaker<paddle::framework::OpDesc>,
    ops::ElementwiseAddDoubleGradMaker<paddle::imperative::OpBase>);

--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@ -25,8 +25,13 @@ void default_elementwise_add(const framework::ExecutionContext &ctx,
                             const framework::Tensor *x,
                             const framework::Tensor *y, framework::Tensor *z) {
  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                        AddFunctor<T>(), z);
+  if (x->numel() >= y->numel()) {
+    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          AddFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseAddFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseAddFunctor<T>(), z);
+  }
 }

 template <typename DeviceContext, typename T, class Enable = void>
@ -128,12 +133,13 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {

    using Tensor = framework::Tensor;

+    auto *x = ctx.Input<Tensor>("X");
+    auto *y = ctx.Input<Tensor>("Y");
    auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    // skip out, x, y
+    // skip out
    auto *out = dout;
-    auto *x = dout, *y = dout;

    if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
      elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc
@ -76,6 +76,7 @@ class ElementwiseDivGradOpMaker : public framework::SingleGradOpMaker<T> {
  std::unique_ptr<T> Apply() const override {
    std::unique_ptr<T> op(new T());
    op->SetType("elementwise_div_grad");
+    op->SetInput("X", this->Input("X"));
    op->SetInput("Y", this->Input("Y"));
    op->SetInput("Out", this->Output("Out"));
    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h
@ -31,8 +31,13 @@ void default_elementwise_div(const framework::ExecutionContext& ctx,
                             const framework::Tensor* x,
                             const framework::Tensor* y, framework::Tensor* z) {
  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                        DivFunctor<T>(), z);
+  if (x->numel() >= y->numel()) {
+    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          DivFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseDivFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseDivFunctor<T>(), z);
+  }
 }

 template <typename DeviceContext, typename T, class Enable = void>
@ -112,13 +117,13 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
    ElemwiseGradKernel<T>::Compute(ctx);
    using Tensor = framework::Tensor;

+    auto* x = ctx.Input<Tensor>("X");
    auto* y = ctx.Input<Tensor>("Y");
    auto* out = ctx.Input<Tensor>("Out");
    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
-    auto* x = dout;  // Fake x, not used

    if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
      elementwise_div_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
@ -191,7 +196,7 @@ class ElementwiseDivDoubleGradKernel : public framework::OpKernel<T> {
    // ddX_safe == null ? 0 : ddX
    // ddY_safe == null ? 0 : ddY
    Tensor ddX_safe, ddY_safe;
-    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, Out, ddX, &ddX_safe);
+    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, dX, ddX, &ddX_safe);
    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, Y, ddY, &ddY_safe);

    // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
@ -209,8 +214,7 @@ class ElementwiseDivDoubleGradKernel : public framework::OpKernel<T> {
    if (dY) {
      // dX_div_Y = dX / Y;
      Tensor dX_div_Y = tmp;
-      ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(
-          ctx, dX, Y, axis, DivFunctor<T>(), &dX_div_Y);
+      default_elementwise_div<DeviceContext, T>(ctx, dX, Y, &dX_div_Y);

      // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the
      // first output tensor is nullptr, the branch to calculate first
@ -227,10 +231,8 @@ class ElementwiseDivDoubleGradKernel : public framework::OpKernel<T> {
    if (ddOut) {
      // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
      default_elementwise_mul<DeviceContext, T>(ctx, Out, &ddY_safe, &tmp);
-      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-          ctx, &ddX_safe, &tmp, 0, SubFunctor<T>(), &tmp);
-      ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(
-          ctx, &tmp, Y, axis, DivFunctor<T>(), ddOut);
+      default_elementwise_sub<DeviceContext, T>(ctx, &ddX_safe, &tmp, &tmp);
+      default_elementwise_div<DeviceContext, T>(ctx, &tmp, Y, ddOut);
    }

    if (dOut) {
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@ -26,9 +26,15 @@ void default_elementwise_mul(const framework::ExecutionContext& ctx,
                             const framework::Tensor* x,
                             const framework::Tensor* y, framework::Tensor* z) {
  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                        MulFunctor<T>(), z);
+  if (x->numel() >= y->numel()) {
+    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          MulFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseMulFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseMulFunctor<T>(), z);
+  }
 }
+
 template <typename DeviceContext, typename T, class Enable = void>
 struct SameDimsElemwiseMul {
  void operator()(const framework::ExecutionContext& ctx,
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@ -14,12 +14,15 @@ limitations under the License. */

 #pragma once

+#include <algorithm>  // for max
 #include <memory>
 #include <string>
 #include <unordered_map>
+#include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"

 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@ -35,12 +38,12 @@ class ElementwiseOp : public framework::OperatorWithKernel {
  using Tensor = framework::Tensor;

  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of elementwise op should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of elementwise op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of elementwise op should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                      "Input(X) of elementwise op should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true,
+                      "Input(Y) of elementwise op should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                      "Output(Out) of elementwise op should not be null.");

    PADDLE_ENFORCE(
        ctx->GetInputsVarType("Y").front() ==
@ -49,18 +52,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
        ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front());

    if (ctx->GetInputsVarType("X").front() ==
-        framework::proto::VarType::LOD_TENSOR) {
-      auto x_dim = ctx->GetInputDim("X");
-      auto y_dim = ctx->GetInputDim("Y");
-      PADDLE_ENFORCE_GE(
-          x_dim.size(), y_dim.size(),
-          "ShapeError: the dimension of input X must greater than or equal to "
-          "the one of input Y. But received: the shape of input X = [%s], the "
-          "dimension of input X = %d, the shape of input Y = [%s], the "
-          "dimension of input Y = %d",
-          x_dim, x_dim.size(), y_dim, y_dim.size());
-    } else if (ctx->GetInputsVarType("X").front() ==
-               framework::proto::VarType::SELECTED_ROWS) {
+        framework::proto::VarType::SELECTED_ROWS) {
      PADDLE_ENFORCE_EQ(
          ctx->GetInputDim("Y").size(), 1u,
          "ShapeError: For elementwise_op, if X is Sparse(VarType.SELECTED_ROWS"
@ -71,13 +63,31 @@ class ElementwiseOp : public framework::OperatorWithKernel {
          "ShapeError: For elementwise_op, if X is Sparse(VarType.SELECTED_ROWS"
          "), Y must be scalar. But reveived the first dimension of Y = %s",
          ctx->GetInputDim("Y")[0]);
-    } else {
+    } else if (ctx->GetInputsVarType("X").front() !=
+               framework::proto::VarType::LOD_TENSOR) {
      PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
                   ctx->GetInputsVarType("X").front());
    }

-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
+    if (ctx->GetInputDim("X") == ctx->GetInputDim("Y")) {
+      ctx->ShareDim("X", /*->*/ "Out");
+      ctx->ShareLoD("X", /*->*/ "Out");
+    } else {
+      auto x_dims = ctx->GetInputDim("X");
+      auto y_dims = ctx->GetInputDim("Y");
+      int max_dim = std::max(x_dims.size(), y_dims.size());
+      int axis = ctx->Attrs().Get<int>("axis");
+      axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
+      std::vector<int> x_dims_array(max_dim);
+      std::vector<int> y_dims_array(max_dim);
+      std::vector<int> out_dims_array(max_dim);
+      GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
+                             y_dims_array.data(), out_dims_array.data(),
+                             max_dim, axis);
+      ctx->SetOutputDim("Out", framework::make_ddim(out_dims_array));
+      // to do
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
  }

  framework::OpKernelType GetExpectedKernelType(
@ -207,26 +217,14 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {

  void InferShape(framework::InferShapeContext *ctx) const override {
    auto out_grad_name = framework::GradVarName("Out");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(out_grad_name),
-                   "Input(Out@GRAD) should not be null");
-
-    auto x_dims = ctx->GetInputDim(out_grad_name);
-    auto y_dims = ctx->GetInputDim("Y");
-
-    PADDLE_ENFORCE_GE(
-        x_dims.size(), y_dims.size(),
-        "ShapeError: the dimension of Out@GRAD must greater than or equal to "
-        "the one of input Y. But received: the shape of Out@GRAD = [%s], the "
-        "dimension of Out@GRAD = %d, the shape of input Y = [%s], the "
-        "dimension of of input Y = %d",
-        x_dims, x_dims.size(), y_dims, y_dims.size());
-
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true, "Input(Y) should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput(out_grad_name), true,
+                      "Input(Out@GRAD) should not be null.");
    auto x_grad_name = framework::GradVarName("X");
    auto y_grad_name = framework::GradVarName("Y");
    if (ctx->HasOutput(x_grad_name)) {
-      ctx->ShareDim(out_grad_name, /*->*/ x_grad_name);
-      ctx->ShareLoD(out_grad_name, /*->*/ x_grad_name);
+      ctx->ShareDim("X", /*->*/ x_grad_name);
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
    }
    if (ctx->HasOutput(y_grad_name)) {
      ctx->ShareDim("Y", /*->*/ y_grad_name);
@ -326,32 +324,6 @@ class ElementwiseOpDoubleGradWithoutDXDY
  }
 };

-// For Add, Sub op, the X, Out is not needed.
-class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
- public:
-  using operators::ElementwiseOpGrad::ElementwiseOpGrad;
-  using operators::ElementwiseOpGrad::GetExpectedKernelType;
-  using Tensor = framework::Tensor;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->ShareDim(framework::GradVarName("Out"), /*->*/ x_grad_name);
-      ctx->ShareLoD(framework::GradVarName("Out"), /*->*/ x_grad_name);
-    }
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(y_grad_name)) {
-      PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
-
-      ctx->ShareDim("Y", /*->*/ y_grad_name);
-      ctx->ShareLoD("Y", /*->*/ y_grad_name);
-    }
-  }
-};
-
 template <typename T>
 class ElemwiseGradKernel : public framework::OpKernel<T> {
 public:
@ -372,13 +344,13 @@ DECLARE_INPLACE_OP_INFERER(ElementwiseGradOpInplace,
                            framework::GradVarName("X")});
 DECLARE_INPLACE_OP_INFERER(ElementwiseDoubleGradOpInplace, {"DDX", "DDOut"});

-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "Y");
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "X",
+                                      "Y");
 DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseDoubleGradNoBufVarsInference,
                                      "Y", "DOut");

 }  // namespace operators
 }  // namespace paddle
-
 #define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name)              \
  template <typename T>                                                 \
  class kernel_type##GradMaker                                          \
@ -390,6 +362,7 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseDoubleGradNoBufVarsInference,
    std::unique_ptr<T> Apply() const override {                         \
      auto *op = new T();                                               \
      op->SetType(#kernel_type "_grad");                                \
+      op->SetInput("X", this->Input("X"));                              \
      op->SetInput("Y", this->Input("Y"));                              \
      op->SetInput(::paddle::framework::GradVarName("Out"),             \
                   this->OutputGrad("Out"));                            \
@ -402,41 +375,6 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseDoubleGradNoBufVarsInference,
    }                                                                   \
  }

-#define REGISTER_ELEMWISE_OP(op_type, op_name, equation)                    \
-  class __ElemwiseOp##op_type##Maker__                                      \
-      : public ::paddle::operators::ElementwiseOpMaker {                    \
-   protected:                                                               \
-    virtual std::string GetName() const { return op_name; }                 \
-    virtual std::string GetEquation() const { return equation; }            \
-  };                                                                        \
-  REGISTER_OPERATOR(                                                        \
-      op_type, ::paddle::operators::ElementwiseOp,                          \
-      __ElemwiseOp##op_type##Maker__,                                       \
-      ::paddle::operators::ElementwiseOpInferVarType,                       \
-      ::paddle::framework::DefaultGradOpMaker<::paddle::framework::OpDesc,  \
-                                              true>,                        \
-      ::paddle::framework::DefaultGradOpMaker<::paddle::imperative::OpBase, \
-                                              true>);                       \
-  REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
-
-#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation)     \
-  class __ElemwiseOp##op_type##Maker__                                \
-      : public ::paddle::operators::ElementwiseOpMaker {              \
-   protected:                                                         \
-    virtual std::string GetName() const { return op_name; }           \
-    virtual std::string GetEquation() const { return equation; }      \
-  };                                                                  \
-  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,      \
-                    __ElemwiseOp##op_type##Maker__,                   \
-                    ::paddle::operators::ElementwiseOpInferVarType,   \
-                    op_type##GradMaker<::paddle::framework::OpDesc>,  \
-                    op_type##GradMaker<::paddle::imperative::OpBase>, \
-                    ::paddle::operators::ElementwiseOpInplace);       \
-  REGISTER_OPERATOR(op_type##_grad,                                   \
-                    ::paddle::operators::ElementwiseOpExplicitGrad,   \
-                    ::paddle::operators::ElementwiseGradOpInplace,    \
-                    ::paddle::operators::ElementwiseGradNoBufVarsInference)
-
 #define REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(op_type, op_name)    \
  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,        \
                    ::paddle::operators::Elementwise##op_name##OpMaker, \
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h
@ -44,6 +44,12 @@ namespace operators {
    inline HOSTDEVICE T operator()(const T& a, const T& b) const { \
      return a expr b;                                             \
    }                                                              \
+  };                                                               \
+  template <typename T, class Enable = void>                       \
+  struct Inverse##Func##Functor {                                  \
+    inline HOSTDEVICE T operator()(const T& a, const T& b) const { \
+      return b expr a;                                             \
+    }                                                              \
  };

 DEFINE_SIMPLE_BINARY_FUNCTOR(Add, +)
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
@ -93,13 +93,14 @@ class ElementwiseSubDoubleGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace operators
 }  // namespace paddle

-namespace ops = paddle::operators;
 REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
 REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_sub, Sub);

+namespace ops = paddle::operators;
+
 REGISTER_OPERATOR(
-    elementwise_sub_grad, ops::ElementwiseOpExplicitGrad,
-    ops::ElementwiseGradOpInplace, ops::ElementwiseGradNoBufVarsInference,
+    elementwise_sub_grad, ops::ElementwiseOpGrad, ops::ElementwiseGradOpInplace,
+    ops::ElementwiseGradNoBufVarsInference,
    ops::ElementwiseSubDoubleGradMaker<paddle::framework::OpDesc>,
    ops::ElementwiseSubDoubleGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(elementwise_sub_grad_grad,
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
@ -26,8 +26,13 @@ void default_elementwise_sub(const framework::ExecutionContext& ctx,
                             const framework::Tensor* x,
                             const framework::Tensor* y, framework::Tensor* z) {
  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                        SubFunctor<T>(), z);
+  if (x->numel() >= y->numel()) {
+    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          SubFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseSubFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseSubFunctor<T>(), z);
+  }
 }

 template <typename DeviceContext, typename T, class Enable = void>
@ -98,13 +103,14 @@ class ElementwiseSubGradKernel : public ElemwiseGradKernel<T> {
    ElemwiseGradKernel<T>::Compute(ctx);
    using Tensor = framework::Tensor;

+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
-    // skip out, x, y
+    // skip out
    auto* out = dout;
-    auto *x = dout, *y = dout;
    if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
      elementwise_sub_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
    } else {
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
@ -108,8 +108,9 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
      auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
      axis = (y_dims.size() == 0) ? x_dims.size() : axis;

-      int pre, n, post;
-      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
+      int pre, n, post, is_run_common_broadcast;
+      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post,
+                   &is_run_common_broadcast);

      if (post == 1) {
        functor.RunRowWise(n, pre);
@ -212,6 +213,8 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
      }
    } else {
      // Execute default kernel when broadcast is needed
+      x = ctx.Input<Tensor>("X");
+      y = ctx.Input<Tensor>("Y");
      ElemwiseExplicitGradCompute<paddle::platform::CPUDeviceContext, T,
                                  IdentityGrad<T>, IdentityGrad<T>>(
          ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
@ -91,8 +91,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
    const bool is_y_format_correct = y->format() == MKLDNNMemoryFormat::nc;
    if (is_x_format_correct && is_y_format_correct && are_dims_divisable &&
        is_avx512_enabled) {
-      int pre, n, post;
-      get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post);
+      int pre, n, post, is_run_common_broadcast;
+      get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post,
+                   &is_run_common_broadcast);

      if (post == 1) {
        PADDLE_THROW("Not implemented when post is 1");
@ -168,8 +169,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
      auto y_dims = trim_trailing_singular_dims(y_dims_untrimmed);
      axis = (y_dims.size() == 0) ? x_dims.size() : axis;

-      int pre, n, post;
-      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
+      int pre, n, post, is_run_common_broadcast;
+      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post,
+                   &is_run_common_broadcast);

      if (post == 1) {
        functor.RunRowWise(n, pre);
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/blas.h"
 #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
@ -139,21 +140,6 @@ struct DivAndSqrtFunctor {
  T epsilon_;
 };

-template <typename T>
-struct MulFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
-};
-
-template <typename T>
-struct AddFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
-};
-
-template <typename T>
-struct SubFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
-};
-
 template <typename T>
 struct MulInvVarFunctor {
  inline HOSTDEVICE T operator()(T a, T b) const {
--- a/paddle/fluid/operators/op_debug_string_test.cc
+++ b/paddle/fluid/operators/op_debug_string_test.cc
@ -32,6 +32,7 @@ TEST(op_debug_str, test_unknown_dtype) {
  framework::Scope scope;

  desc.SetType("elementwise_add_grad");
+  desc.SetInput("X", {"X"});
  desc.SetInput("Y", {"Y"});
  desc.SetInput(framework::GradVarName("Out"), {framework::GradVarName("Out")});
  desc.SetOutput(framework::GradVarName("X"), {framework::GradVarName("X")});
@ -41,6 +42,10 @@ TEST(op_debug_str, test_unknown_dtype) {
  desc.SetAttr("x_data_format", "");
  desc.SetAttr("y_data_format", "");

+  auto x_tensor = scope.Var("X")->GetMutable<framework::LoDTensor>();
+  x_tensor->Resize(dim);
+  x_tensor->mutable_data<float>(place);
+
  auto y_tensor = scope.Var("Y")->GetMutable<framework::LoDTensor>();
  y_tensor->Resize(dim);
  y_tensor->mutable_data<float>(place);
--- a/paddle/fluid/train/demo/run.sh
+++ b/paddle/fluid/train/demo/run.sh
@ -7,8 +7,8 @@ TURN_ON_MKL=$2 # use MKL or Openblas

 # download models
 function download() {
-    wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR/main_program
-    wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR/startup_program
+    wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR-1-7/main_program
+    wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR-1-7/startup_program
 }

 download
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@ -308,6 +308,36 @@ class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
        self.axis = -1


+class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1, 1, 4).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 1, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 1, 4, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(4, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = 2
+
+
 class TestElementwiseAddOpError(OpTest):
    def test_errors(self):
        with program_guard(Program(), Program()):
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
@ -151,6 +151,39 @@ class TestElementwiseDivOp_broadcast_5(ElementwiseDivOp):
        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}


+class TestElementwiseDivOp_commonuse_1(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [1, 1, 4]).astype("float32"),
+        }
+        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseDivOp_commonuse_2(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 1, 5]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [2, 1, 4, 1]).astype("float32"),
+        }
+        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseDivOp_xsize_lessthan_ysize(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [4, 5]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"),
+        }
+
+        self.attrs = {'axis': 2}
+
+        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
+
+
 class TestElementwiseDivOp_INT(OpTest):
    def setUp(self):
        self.op_type = "elementwise_div"
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@ -162,6 +162,41 @@ class TestElementwiseMulOpFp16(ElementwiseMulOp):
        self.dtype = np.float16


+class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
+            'Y': np.random.rand(1, 1, 4).astype(np.float64)
+        }
+        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
+
+
+class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 1, 5).astype(np.float64),
+            'Y': np.random.rand(2, 1, 4, 1).astype(np.float64)
+        }
+        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
+
+
+class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(4, 5).astype(np.float64),
+            'Y': np.random.rand(2, 3, 4, 5).astype(np.float64)
+        }
+
+        self.attrs = {'axis': 2}
+
+        self.outputs = {
+            'Out': self.inputs['X'].reshape(1, 1, 4, 5) * self.inputs['Y']
+        }
+
+
 class TestElementwiseMulOpError(OpTest):
    def test_errors(self):
        with program_guard(Program(), Program()):
--- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
@ -127,5 +127,40 @@ class TestElementwiseSubOp_broadcast_4(TestElementwiseOp):
        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}


+class TestElementwiseSubOp_commonuse_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(1, 1, 4).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+
+
+class TestElementwiseSubOp_commonuse_2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 1, 5).astype(np.float32),
+            'Y': np.random.rand(2, 1, 4, 1).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+
+
+class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(4, 5).astype(np.float32),
+            'Y': np.random.rand(2, 3, 4, 5).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 2}
+
+        self.outputs = {
+            'Out': self.inputs['X'].reshape(1, 1, 4, 5) - self.inputs['Y']
+        }
+
+
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py
@ -47,7 +47,7 @@ class TestExecutorReturnTensorNotOverwritingWithOptest(OpTest):
            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
        }
        self.outputs = {'Out': self.out}
-        self.op_type = "elementwise_mul"
+        self.op_type = "mul"
        self.dtype = np.float32
        outs, fetch_list = self._calc_output(place, parallel=parallel)
        return outs
--- a/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py
@ -57,7 +57,7 @@ def run_trainer(use_cuda, sync_mode, ip, port, trainers, trainer_id):
    exe.run(trainer_startup_program)
    for i in range(5):
        exe.run(recv_program)
-        exe.run(main_program,
+        exe.run(fluid.default_main_program(),
                feed={
                    "x": numpy.array([1, 2]).astype('float32').reshape(2, 1),
                    "y": numpy.array([2, 3]).astype('float32').reshape(2, 1)