@@ -24,7 +24,7 @@ namespace paddle {
 namespace operators {
 
 template <typename DeviceContext, typename T>
-class CoalesceTensorOp : public framework::OpKernel<T> {
+class CoalesceTensorOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
     auto &in_var_names = context.Inputs("Input");
@@ -32,24 +32,39 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
     auto &in_vars = context.MultiInputVar("Input");
     auto out_vars = context.MultiOutputVar("Output");
 
-    PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0));
-    PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size());
+    PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0),
+                      "The CoalesceTensorOp has no input.");
+    PADDLE_ENFORCE_EQ(
+        in_var_names.size(), out_var_names.size(),
+        "The number of CoalesceTensorOp's input and output is not match.");
 
+    // Input & Output check: only support LoDTensor
     for (size_t i = 0; i < in_var_names.size(); ++i) {
-      // Only support LoDTensor
-      PADDLE_ENFORCE_NOT_NULL(in_vars[i], "%s should not be nullptr,",
-                              in_var_names[i]);
-      PADDLE_ENFORCE_NOT_NULL(out_vars[i], "%s should not be nullptr,",
-                              out_var_names[i]);
-      PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensor>());
-      PADDLE_ENFORCE(out_vars[i]->IsType<framework::LoDTensor>());
+      PADDLE_ENFORCE_NOT_NULL(
+          in_vars[i],
+          "The input variable %s of CoalesceTensorOp does not exist.",
+          in_var_names[i]);
+      PADDLE_ENFORCE_NOT_NULL(
+          out_vars[i],
+          "The output variable %s of CoalesceTensorOp does not exist.",
+          out_var_names[i]);
+      PADDLE_ENFORCE_EQ(
+          in_vars[i]->IsType<framework::LoDTensor>(), true,
+          "The input variable %s of CoalesceTensorOp is not LoDTensor.",
+          in_var_names[i]);
+      PADDLE_ENFORCE_EQ(
+          out_vars[i]->IsType<framework::LoDTensor>(), true,
+          "The output variable %s of CoalesceTensorOp is not LoDTensor.",
+          in_var_names[i]);
     }
 
     auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
 
     if (context.Attr<bool>("check_name")) {
       for (size_t i = 0; i < in_var_names.size(); ++i) {
-        PADDLE_ENFORCE_EQ(in_var_names[i], out_var_names[i]);
+        PADDLE_ENFORCE_EQ(
+            in_var_names[i], out_var_names[i],
+            "The input and output variable of CoalesceTensorOp is different.");
       }
     } else {
       // Init the output as input
@@ -124,8 +139,8 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
     std::stringstream ss;
     ss << "alloc_space_for_vars: ";
     for (size_t i = 0; i < var_names.size(); ++i) {
-      PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.",
-                     var_names[i]);
+      PADDLE_ENFORCE_EQ(lod_tensors[i]->IsInitialized(), true,
+                        "%s is not initialized.", var_names[i]);
 
       auto size = lod_tensors[i]->numel();
       PADDLE_ENFORCE_GT(size, 0);
@@ -140,14 +155,14 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
   }
 };
 
-class AllocContinuousSpaceOp : public framework::OperatorWithKernel {
+class CoalesceTensorOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {}
 };
 
-class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
+class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("Input",
@@ -179,7 +194,7 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
                   "they are the same separately.")
         .SetDefault(false);
     AddComment(R"DOC(
-AllocContinuousSpace Operator.
+CoalesceTensor Operator.
 
 coalesce_tensor is used to make the address of Output
 continuous according to the Input. This Op will alloc a big tensor
@@ -200,22 +215,22 @@ setting the Output with a constant value.
 } // namespace operators
 } // namespace paddle
 
-REGISTER_OPERATOR(coalesce_tensor, paddle::operators::AllocContinuousSpaceOp,
-                  paddle::operators::AllocContinuousSpaceOpMaker);
+REGISTER_OPERATOR(coalesce_tensor, paddle::operators::CoalesceTensorOp,
+                  paddle::operators::CoalesceTensorOpMaker);
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CPU_KERNEL(
     coalesce_tensor,
-    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, plat::float16>,
-    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, int>,
-    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, float>,
-    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, double>);
+    ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, double>);
 
 #ifdef PADDLE_WITH_CUDA
 REGISTER_OP_CUDA_KERNEL(
     coalesce_tensor,
-    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, int>,
-    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, float>,
-    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, double>);
+    ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext,
+                                plat::float16>,
+    ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
 #endif
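
For orientation, here is a condensed sketch of the structure this patch leaves behind: the device/dtype-templated kernel is now CoalesceTensorOpKernel, the OperatorWithKernel class takes the CoalesceTensorOp name, and the registrations point at the renamed classes. The sketch only restates the hunks above with bodies elided; it is a reading aid, not additional code from the patch, and is not meant to compile as-is outside the Paddle source tree.

// Condensed sketch of the post-patch structure (bodies elided).
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

// Device- and dtype-templated kernel, renamed from CoalesceTensorOp.
template <typename DeviceContext, typename T>
class CoalesceTensorOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    // Checks that every Input/Output variable exists and is a LoDTensor;
    // the PADDLE_ENFORCE_* messages now name the offending variable and op.
  }
};

// Operator class, renamed from AllocContinuousSpaceOp.
class CoalesceTensorOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext *ctx) const override {}
};

// Proto maker, renamed from AllocContinuousSpaceOpMaker; in the real file
// Make() declares Input/Output and attributes such as check_name.
class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {}
};

}  // namespace operators
}  // namespace paddle

// Registration pairs the coalesce_tensor op type with the renamed classes;
// the real file registers kernels for every dtype listed in the last hunk.
REGISTER_OPERATOR(coalesce_tensor, paddle::operators::CoalesceTensorOp,
                  paddle::operators::CoalesceTensorOpMaker);
REGISTER_OP_CPU_KERNEL(
    coalesce_tensor,
    paddle::operators::CoalesceTensorOpKernel<
        paddle::platform::CPUDeviceContext, float>);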