Gradient check using the graph API (#5027)

* Simplify Gradient Check

* Stash

* Extract apply_backward_pass to backward.py

Rename apply_backward_pass to append_backward_ops

* Use graph API to check gradient

* Fix ci

* Fix CI

* Fix backward for double precision

* Stash

* Fix CI

* Fix ci

* Ignore GRU test

* Ignore xe op

* Fix CI

* Fix softmax with xe gradient

The correct equation should be IG = OG * (d_softmax_with_xe()), i.e. the input gradient is the output (loss) gradient multiplied by the derivative of softmax-with-cross-entropy (see the NumPy sketch just below the commit info)

* Fix typo

* Fix merge error

* Disable LRN
Authored by Yu Yang, committed by QI JUN
parent fc68290bcc
commit be00b0c4d6
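
As a companion to the "Fix softmax with xe gradient" commit above, here is a minimal NumPy sketch of the hard-label case of that equation: the input gradient equals the loss (output) gradient times the derivative of softmax-with-cross-entropy, i.e. out_grad * (softmax - one_hot(label)). The helper names below are illustrative and are not code from this PR.

import numpy as np

def softmax(logits):
    # numerically stable softmax over the last axis
    e = np.exp(logits - logits.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def softmax_xe_grad(logits, labels, loss_grad):
    # IG = OG * d_softmax_with_xe():
    # for hard labels, d_softmax_with_xe() = softmax(logits) - one_hot(labels)
    batch = logits.shape[0]
    grad = softmax(logits)
    grad[np.arange(batch), labels] -= 1.0        # softmax - one_hot
    return grad * loss_grad[:, np.newaxis]       # scale each row by its loss gradient

logits = np.array([[2.0, 1.0, 0.1]])
labels = np.array([1])
print(softmax_xe_grad(logits, labels, loss_grad=np.ones(1)))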

@@ -26,7 +26,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info operator)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info operator glog)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)

@@ -452,11 +452,13 @@ ParamGradInfoMap AppendBackward(
std::transform(target_shape_desc.begin(), target_shape_desc.end(),
std::back_inserter(target_shape),
[](int64_t dim) { return static_cast<int>(dim); });
VLOG(3) << "backward from loss=" << target.Name()
<< " data_type=" << target.GetDataType();
std::unique_ptr<OpDescBind> fill_one_op(
new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
{{"shape", target_shape},
{"value", static_cast<float>(1.0)},
{"data_type", framework::DataType::FP32}}));
{"data_type", target.GetDataType()}}));
root_block->AppendAllocatedOp(std::move(fill_one_op));
size_t forward_op_num = root_block->OpSize();
size_t forward_block_num = program_desc.Size();
@@ -475,8 +477,7 @@ ParamGradInfoMap AppendBackward(
std::unordered_map<std::string, GradVarInfo> retv;
auto var = root_block->Var(fill_one_op_out);
// FIXME(qiao) infer the data type
var->SetDataType(framework::DataType::FP32);
var->SetDataType(target.GetDataType());
var->SetShape(target.Shape());
auto& target_grad = retv[target.Name()];
target_grad.name_ = fill_one_op_out;
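
For context on the change above: AppendBackward seeds the backward pass by filling the loss gradient with ones, and after this change the fill_constant op and the seed variable take the loss's own data type instead of the hard-coded FP32. A rough NumPy analogy of what that seed looks like (illustrative only, not the C++ above):

import numpy as np

def seed_loss_grad(loss):
    # all-ones gradient with the loss's own shape and dtype, so a
    # float64 loss starts a float64 backward pass
    return np.full(loss.shape, 1.0, dtype=loss.dtype)

loss = np.zeros((1,), dtype=np.float64)
assert seed_loss_grad(loss).dtype == np.float64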

@@ -19,6 +19,8 @@ limitations under the License. */
#include "paddle/framework/operator.h"
#include "paddle/framework/program_desc.h"
#include "glog/logging.h"
namespace paddle {
namespace framework {
@@ -262,6 +264,7 @@ void OpDescBind::CheckAttrs() {
}
void OpDescBind::InferShape(const BlockDescBind &block) const {
VLOG(3) << "CompileTime infer shape on " << Type();
auto &funcs = InferShapeFuncs();
auto it = funcs.find(this->Type());
if (it == funcs.end()) {

@@ -414,7 +414,9 @@ class CompileTimeInferShapeContext : public InferShapeContext {
private:
DDim GetDim(const std::string& name) const override {
return framework::make_ddim(block_.FindVarRecursive(name)->Shape());
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
return framework::make_ddim(var->Shape());
}
void SetDim(const std::string& name, const DDim& dim) override {
@@ -658,8 +660,9 @@ class OperatorWithKernel : public OperatorBase {
}
if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type()));
VLOG(3) << "Input " << ipt_name << " with data_type " << tmp;
PADDLE_ENFORCE(tmp == data_type || data_type == -1,
"DataType of Paddle Op must be same.");
"DataType of Paddle Op %s must be same.", Type());
data_type = tmp;
}
}

@@ -446,12 +446,16 @@ REGISTER_OP(thresholded_relu, ops::ActivationOp,
REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker<float>,
hard_sigmoid_grad, ops::ActivationOpGrad);
#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>); \
REGISTER_OP_CPU_KERNEL(act_type##_grad, \
ops::ActivationGradKernel<paddle::platform::CPUPlace, \
ops::grad_functor<float>>);
#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::CPUPlace, \
ops::functor<double>>); \
REGISTER_OP_CPU_KERNEL( \
act_type##_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::CPUPlace, \
ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);

@@ -17,12 +17,16 @@
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_GPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>); \
REGISTER_OP_GPU_KERNEL(act_type##_grad, \
ops::ActivationGradKernel<paddle::platform::GPUPlace, \
ops::grad_functor<float>>);
#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_GPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::GPUPlace, \
ops::functor<double>>); \
REGISTER_OP_GPU_KERNEL( \
act_type##_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::GPUPlace, \
ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);

@@ -210,8 +210,8 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
auto temp1 = (x < (threshold * -1)).template cast<T>().eval();
auto temp2 = (x > threshold).template cast<T>().eval();
auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>().eval();
auto temp2 = (x > static_cast<T>(threshold)).template cast<T>().eval();
y.device(d) = x * (temp1 + temp2);
}
};
@@ -226,8 +226,8 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
auto temp1 = (x < (threshold * -1)).template cast<T>().eval();
auto temp2 = (x > threshold).template cast<T>().eval();
auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>().eval();
auto temp2 = (x > static_cast<T>(threshold)).template cast<T>().eval();
dx.device(d) = dy * (temp1 + temp2).template cast<T>();
}
};
@@ -243,9 +243,10 @@ struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
auto temp1 = (x > lambda).template cast<T>().eval();
auto temp2 = (x < -lambda).template cast<T>().eval();
y.device(d) = temp1 * (x - lambda) + temp2 * (x + lambda);
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>().eval();
auto temp2 = (x < -lambdaT).template cast<T>().eval();
y.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
}
};
@@ -257,8 +258,9 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
auto temp1 = (x > lambda).template cast<T>().eval();
auto temp2 = (x < -lambda).template cast<T>().eval();
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>().eval();
auto temp2 = (x < -lambdaT).template cast<T>().eval();
dx.device(d) = dy * (temp1 + temp2).template cast<T>();
}
};
@@ -362,7 +364,8 @@ struct BReluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = x.cwiseMax(t_min).cwiseMin(t_max);
y.device(d) =
x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
}
};
@@ -375,7 +378,9 @@ struct BReluGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
dx.device(d) = dy * ((x > t_min) * (x < t_max)).template cast<T>();
dx.device(d) = dy *
((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
.template cast<T>();
}
};
@@ -390,7 +395,8 @@ struct Relu6Functor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = x.cwiseMax(static_cast<T>(0)).cwiseMin(threshold);
y.device(d) =
x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
}
};
@@ -402,8 +408,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
dx.device(d) =
dy * ((x > static_cast<T>(0)) * (x < threshold)).template cast<T>();
dx.device(d) = dy *
((x > static_cast<T>(0)) * (x < static_cast<T>(threshold)))
.template cast<T>();
}
};
@@ -463,7 +470,8 @@ struct SoftReluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
auto temp = x.cwiseMax(-threshold).cwiseMin(threshold);
auto tmp = static_cast<T>(threshold);
auto temp = x.cwiseMax(-tmp).cwiseMin(tmp);
y.device(d) = (static_cast<T>(1) + temp.exp()).log();
}
};
@@ -476,7 +484,8 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
auto temp = ((x > -threshold) * (x < threshold)).template cast<T>().eval();
auto tmp = static_cast<T>(threshold);
auto temp = ((x > -tmp) * (x < tmp)).template cast<T>().eval();
dx.device(d) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
}
};
@@ -490,7 +499,7 @@ struct LeakyReluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = x.cwiseMax(alpha * x);
y.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
}
};
@@ -502,7 +511,8 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
auto temp1 = alpha * (x < static_cast<T>(0)).template cast<T>().eval();
auto temp1 = static_cast<T>(alpha) *
(x < static_cast<T>(0)).template cast<T>().eval();
auto temp2 = (x >= static_cast<T>(0)).template cast<T>().eval();
dx.device(d) = dy * (temp1 + temp2).template cast<T>();
}
@@ -517,9 +527,9 @@ struct ELUFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) =
x.cwiseMax(static_cast<T>(0)) +
(alpha * (x.exp() - static_cast<T>(1))).cwiseMin(static_cast<T>(0));
y.device(d) = x.cwiseMax(static_cast<T>(0)) +
(static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)))
.cwiseMin(static_cast<T>(0));
}
};
@@ -531,9 +541,9 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
dx.device(d) =
dy * (x > static_cast<T>(0)).template cast<T>() +
dy * (y + alpha) * (x < static_cast<T>(0)).template cast<T>();
dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>() +
dy * (y + static_cast<T>(alpha)) *
(x < static_cast<T>(0)).template cast<T>();
}
};
@@ -545,7 +555,7 @@ struct PowFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = x.pow(factor);
y.device(d) = x.pow(static_cast<T>(factor));
}
};
@@ -557,7 +567,8 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
dx.device(d) = dy * factor * x.pow(factor - static_cast<T>(1));
dx.device(d) = dy * static_cast<T>(factor) *
x.pow(static_cast<T>(factor - static_cast<T>(1)));
}
};
@@ -571,7 +582,8 @@ struct STanhFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = scale_b * (scale_a * x).tanh();
y.device(d) =
static_cast<T>(scale_b) * (static_cast<T>(scale_a) * x).tanh();
}
};
@@ -585,8 +597,10 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
auto temp = (scale_a * x).tanh() * (scale_a * x).tanh();
dx.device(d) = dy * scale_a * scale_b * (static_cast<T>(1) - temp);
auto a = static_cast<T>(scale_a);
auto b = static_cast<T>(scale_b);
auto temp = (a * x).tanh() * (a * x).tanh();
dx.device(d) = dy * a * b * (static_cast<T>(1) - temp);
}
};
@@ -599,7 +613,8 @@ struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = (x > static_cast<T>(threshold)).template cast<T>() * x;
auto th = static_cast<T>(threshold);
y.device(d) = (x > th).template cast<T>() * x;
}
};
@@ -612,7 +627,8 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
dx.device(d) = dy * (x > static_cast<T>(threshold)).template cast<T>();
auto th = static_cast<T>(threshold);
dx.device(d) = dy * (x > th).template cast<T>();
}
};

@@ -64,5 +64,6 @@ namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
ops::FillConstantOpMaker);
REGISTER_OP_CPU_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>);
fill_constant, ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, double>,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>);

@@ -18,5 +18,6 @@
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>);
fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>);

@@ -25,7 +25,7 @@ class FillConstantOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto value = ctx.Attr<T>("value");
auto value = ctx.Attr<float>("value");
auto out_eigen = framework::EigenVector<T>::Flatten(*out);
auto place = ctx.GetEigenDevice<Place>();

@@ -171,8 +171,7 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(
weight_width, frame_size * 3,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
auto bias = Input("Bias");
if (bias != framework::kEmptyVarName) {
if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[0];
int bias_width = bias_dims[1];
@@ -203,6 +202,8 @@ namespace ops = paddle::operators;
REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
ops::GRUUnitGradOp);
REGISTER_OP_CPU_KERNEL(gru_unit,
ops::GRUUnitKernel<paddle::platform::CPUPlace, float>);
ops::GRUUnitKernel<paddle::platform::CPUPlace, float>,
ops::GRUUnitKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::CPUPlace, float>);
gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::CPUPlace, float>,
ops::GRUUnitGradKernel<paddle::platform::CPUPlace, double>);

@@ -17,6 +17,8 @@
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(gru_unit,
ops::GRUUnitKernel<paddle::platform::GPUPlace, float>);
ops::GRUUnitKernel<paddle::platform::GPUPlace, float>,
ops::GRUUnitKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::GPUPlace, float>);
gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::GPUPlace, float>,
ops::GRUUnitGradKernel<paddle::platform::GPUPlace, double>);

@@ -71,7 +71,8 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
REGISTER_OP_CPU_KERNEL(mean,
ops::MeanKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<paddle::platform::CPUPlace, float>,
ops::MeanKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(mean_grad,
ops::MeanGradKernel<paddle::platform::CPUPlace, float>);
ops::MeanGradKernel<paddle::platform::CPUPlace, float>,
ops::MeanGradKernel<paddle::platform::CPUPlace, double>);

@@ -17,7 +17,8 @@
#include "paddle/operators/mean_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(mean,
ops::MeanKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<paddle::platform::GPUPlace, float>,
ops::MeanKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(mean_grad,
ops::MeanGradKernel<paddle::platform::GPUPlace, float>);
ops::MeanGradKernel<paddle::platform::GPUPlace, float>,
ops::MeanGradKernel<paddle::platform::GPUPlace, double>);

@@ -73,4 +73,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
ops::ScaleGradMaker);
REGISTER_OP_CPU_KERNEL(scale,
ops::ScaleKernel<paddle::platform::CPUPlace, float>);
ops::ScaleKernel<paddle::platform::CPUPlace, float>,
ops::ScaleKernel<paddle::platform::CPUPlace, double>);

@@ -15,4 +15,5 @@
#include "paddle/operators/scale_op.h"
REGISTER_OP_GPU_KERNEL(
scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>);
scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>,
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>);

@@ -19,7 +19,7 @@
namespace paddle {
namespace operators {
template <typename Place, typename T, typename AttrType = T>
template <typename Place, typename T>
class ScaleKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& context) const {
@@ -27,7 +27,7 @@ class ScaleKernel : public framework::OpKernel<T> {
auto* in = context.Input<framework::Tensor>("X");
tensor->mutable_data<T>(in->place());
auto scale = static_cast<T>(context.Attr<AttrType>("scale"));
auto scale = static_cast<T>(context.Attr<float>("scale"));
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);

@@ -23,18 +23,21 @@ using Tensor = framework::Tensor;
namespace {
template <typename T>
__global__ void CrossEntropyGrad(T* out_grad, const T* in_grad,
__global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad,
const int* labels, const int batch_size,
const int class_num) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int sample_idx = tid / class_num;
if (tid < batch_size * class_num) out_grad[tid] *= in_grad[sample_idx];
__syncthreads();
if (tid < batch_size) {
PADDLE_ASSERT(labels[sample_idx] >= 0 && labels[sample_idx] < class_num);
out_grad[tid * class_num + labels[tid]] -= 1.;
logit_grad[tid * class_num + labels[tid]] -= static_cast<T>(1.);
}
__syncthreads();
if (tid < batch_size * class_num) {
logit_grad[tid] *= loss_grad[sample_idx];
}
}
@@ -47,7 +50,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
int ids = blockIdx.x * blockDim.x + threadIdx.x;
if (ids < batch_size * class_num) {
int row_ids = ids / class_num;
logit_grad[ids] = logit_grad[ids] * loss_grad[row_ids] - labels[ids];
logit_grad[ids] = logit_grad[ids] * (loss_grad[row_ids] - labels[ids]);
}
}
} // namespace

@@ -67,8 +67,8 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
logit_grad_mat *
out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) -
lbl_mat;
(out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) -
lbl_mat);
} else {
const int batch_size = logit_grad->dims()[0];
const int* label_data = labels->data<int>();
@@ -78,7 +78,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i];
logit_grad_data[index] =
(out_grad_data[i] * logit_grad_data[index] - 1.);
out_grad_data[i] * (logit_grad_data[index] - 1.);
}
}
}
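
A gradient fix like the one above is exactly what this PR's graph-based gradient check is meant to catch: the analytic gradient produced by the kernel is compared against central finite differences, and that comparison is only reliable in float64, which is why double kernels are registered throughout this diff. A minimal, self-contained sketch of such a check (not the op_test implementation itself):

import numpy as np

def numeric_grad(f, x, eps=1e-5):
    # central-difference gradient of a scalar-valued f at x, in float64
    x = x.astype(np.float64)
    grad = np.empty_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        orig = x[idx]
        x[idx] = orig + eps
        f_pos = f(x)
        x[idx] = orig - eps
        f_neg = f(x)
        x[idx] = orig
        grad[idx] = (f_pos - f_neg) / (2 * eps)
        it.iternext()
    return grad

# e.g. the mean op (which gains a double kernel in this diff): d mean(x)/dx = 1/x.size
x = np.random.rand(2, 3)
analytic = np.full_like(x, 1.0 / x.size)
assert np.allclose(numeric_grad(np.mean, x), analytic, rtol=1e-6, atol=1e-8)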

@@ -95,17 +95,18 @@ class SplitOpMaker : public framework::OpProtoAndCheckerMaker {
}
};
class SplitOpGrad : public NetOp {
class SplitGradMaker : public framework::SingleGradOpDescMaker {
public:
SplitOpGrad(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
auto out_grad = Inputs(framework::GradVarName("Out"));
auto x_grad = Output(framework::GradVarName("X"));
AppendOp(framework::OpRegistry::CreateOp("concat", {{"X", out_grad}},
{{"Out", {x_grad}}}, attrs));
CompleteAddOp(false);
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDescBind> Apply() const override {
auto op = new framework::OpDescBind();
op->SetType("concat");
op->SetInput("X", OutputGrad("Out"));
op->SetOutput("Out", InputGrad("X"));
op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(op);
}
};
@@ -114,7 +115,7 @@ class SplitOpGrad : public NetOp {
namespace ops = paddle::operators;
USE_CPU_ONLY_OP(concat);
REGISTER_OP(split, ops::SplitOp, ops::SplitOpMaker, split_grad,
ops::SplitOpGrad);
REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
REGISTER_OP_CPU_KERNEL(split,
ops::SplitOpKernel<paddle::platform::CPUPlace, float>);
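
The SplitGradMaker above replaces the old NetOp-based SplitOpGrad, relying on the identity that the gradient of split is the concatenation of the output gradients along the same axis. A small NumPy illustration of that identity (illustrative only):

import numpy as np

x = np.arange(12.0).reshape(3, 4)
outs = np.split(x, 2, axis=1)                # forward: split X into two pieces
out_grads = [np.ones_like(o) for o in outs]  # gradients flowing into each piece
x_grad = np.concatenate(out_grads, axis=1)   # backward: concat the output grads
assert x_grad.shape == x.shape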

@@ -84,4 +84,5 @@ class SumGradMaker : public framework::GradOpDescMakerBase {
namespace ops = paddle::operators;
REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker);
REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>,
ops::SumKernel<paddle::platform::CPUPlace, double>);

@@ -13,4 +13,5 @@ limitations under the License. */
#include "paddle/operators/sum_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>,
ops::SumKernel<paddle::platform::GPUPlace, double>);

File diff suppressed because it is too large.

@@ -335,7 +335,7 @@ class TestSoftplus(OpTest):
    def setUp(self):
        self.op_type = "softplus"
        self.inputs = {
            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
            'X': np.random.uniform(-1, 1, [11, 17]).astype("float64")
        }
        self.outputs = {'Y': np.log(1 + np.exp(self.inputs['X']))}

@@ -1,10 +1,25 @@
import unittest
import numpy as np
from op_test import OpTest, get_backward_op, grad_var_name
from op_test import OpTest
import paddle.v2.framework.core as core
from paddle.v2.framework.op import Operator


def grad_var_name(var_name):
    return var_name + "@GRAD"


def get_backward_op(scope, op, no_grad_set):
    backward_op = core.Operator.backward(op, no_grad_set)
    for input in backward_op.input_vars():
        var = scope.var(input)
        var.get_tensor()
    for output in backward_op.output_vars():
        var = scope.var(output)
        var.get_tensor()
    return backward_op


def _reference_training(x, scale, offset, epsilon, data_format):
    if data_format != "NHWC":
        raise ValueError("data_format must be NHWC, got %s." % data_format)

Some files were not shown because too many files have changed in this diff.
