Fix cpu_bfloat16_pass (#28730)

* Fix cpu_bfloat16_pass

* Add output_format

* Fix incorrect SetOutput

* Change formatting
joanna.wozna.intel 5 years ago committed by GitHub
parent 2fd16cf6fc
commit fddea67445
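
This commit touches PaddlePaddle's oneDNN bfloat16 placement machinery: the first two hunks add pattern-detector patterns for multi-input ops (concat, sum) and for redundant reorders, and the remaining hunks rewrite the pass's unit tests around quantize/dequantize/force_fp32_output counts. For orientation, a minimal sketch of how the pass is normally triggered from the public C++ inference API (header name and model path are placeholders; the config calls are the public AnalysisConfig API):

#include "paddle_inference_api.h"  // assumed public header name

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("model_dir");  // placeholder model location
  config.EnableMKLDNN();         // oneDNN kernels must be enabled first
  // Schedules the bfloat16 placement passes, including cpu_bfloat16_pass,
  // on the inference graph.
  config.EnableMkldnnBfloat16();
  auto predictor = paddle::CreatePaddlePredictor(config);
  return 0;
}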

@@ -2181,6 +2181,36 @@ PDNode *patterns::FirstBfloat16Ops::operator()() {
return op;
}
+PDNode *patterns::DuplicatedInputs::operator()() {
+  auto op = pattern->NewNode(op_repr())->assert_is_ops({"concat", "sum"});
+  op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "bfloat16";
+  });
+  return op;
+}
+
+PDNode *patterns::UnnecessaryReorders::operator()() {
+  auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op();
+  prev_op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "bfloat16";
+  });
+  auto *quant_in = pattern->NewNode(quant_in_repr())
+                       ->assert_is_op_input("quantize", "Input");
+  auto *quant_op = pattern->NewNode(quant_op_repr())->assert_is_op("quantize");
+  auto *quant_out = pattern->NewNode(quant_out_repr())
+                        ->assert_is_op_output("quantize", "Output");
+  prev_op->LinksTo({quant_in});
+  quant_op->LinksFrom({quant_in}).LinksTo({quant_out});
+  return quant_out;
+}
PDNode *patterns::MKLDNNInPlace::operator()() {
const std::unordered_set<std::string> &supported_op_types = {
"abs",

@@ -1273,6 +1273,26 @@ struct FirstBfloat16Ops : public PatternBase {
PATTERN_DECL_NODE(op);
};
+struct DuplicatedInputs : public PatternBase {
+  DuplicatedInputs(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "many_inputs_op") {}
+
+  PDNode* operator()();
+  PATTERN_DECL_NODE(op);
+};
+
+struct UnnecessaryReorders : public PatternBase {
+  UnnecessaryReorders(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "unnecessary_reorders") {}
+
+  PDNode* operator()();
+  PATTERN_DECL_NODE(prev_op);
+  PATTERN_DECL_NODE(quant_in);
+  PATTERN_DECL_NODE(quant_op);
+  PATTERN_DECL_NODE(quant_out);
+};
// Pattern used for enforcing in-place computation for DNNL ops that
// support it: softmax, batch_norm and layer_norm
struct MKLDNNInPlace : public PatternBase {
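
UnnecessaryReorders targets an op that already produces bfloat16 output feeding a quantize op, so that quantize (a reorder to bf16) is redundant. A hedged sketch of extracting the matched nodes and dropping the spurious quantize; this is not the committed handler, and hardcoding the "Out" slot is an assumption (the "Fix incorrect SetOutput" bullet suggests the real pass must pick the correct output slot per op type):

// Sketch only, modeled on sibling oneDNN passes.
void RemoveUnnecessaryReorders(ir::Graph* graph) {
  GraphPatternDetector gpd;
  patterns::UnnecessaryReorders unnecessary_reorders{gpd.mutable_pattern(),
                                                     "unnecessary_reorders"};
  unnecessary_reorders();
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     ir::Graph* g) {
    GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, unnecessary_reorders);
    GET_IR_NODE_FROM_SUBGRAPH(quant_in, quant_in, unnecessary_reorders);
    GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, unnecessary_reorders);
    GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, unnecessary_reorders);
    // prev_op already emits bfloat16: rewire it to produce quant_out
    // directly, then remove the intermediate variable and quantize op.
    prev_op->Op()->SetOutput("Out", {quant_out->Name()});  // assumed slot
    IR_NODE_LINK_TO(prev_op, quant_out);
    GraphSafeRemoveNodes(g, {quant_in, quant_op});
  };
  gpd(graph, handler);
}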

File diff suppressed because it is too large

@@ -42,60 +42,45 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
type == "dropout") {
op->SetInput("X", {inputs[0]});
op->SetOutput("Out", {outputs[0]});
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
if (type != "dropout") op->SetAttr("mkldnn_data_type", mkldnn_data_type);
} else if (type == "fc") {
op->SetInput("Input", {inputs[0]});
op->SetOutput("Out", {outputs[0]});
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
} else if (type == "concat") {
op->SetAttr("force_fp32_output", force_fp32_output);
} else if (type == "concat" || type == "sum") {
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
} else if (type == "matmul" || type == "elementwise_add") {
} else if (type == "matmul" || type == "elementwise_add" ||
type == "elementwise_mul") {
op->SetInput("X", {inputs[0]});
if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
op->SetOutput("Out", {outputs[0]});
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
if (type == "matmul") op->SetAttr("force_fp32_output", force_fp32_output);
} else if (type == "layer_norm") {
op->SetInput("X", {inputs[0]});
op->SetOutput("Y", {outputs[0]});
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
}
}
-static const std::initializer_list<std::string> variable_names{
-    "z", "a", "b", "c", "d", "e", "f", "g", "h", "i"};
void PreparePass(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog,
const std::initializer_list<std::string> variable_names,
int* original_nodes_num, int* current_nodes_num) {
auto pass = PassRegistry::Instance().Get("cpu_bfloat16_pass");
-  graph->reset(pass->Apply(graph->release()));
*original_nodes_num = (*graph)->Nodes().size();
+  (*graph).reset(pass->Apply((*graph).release()));
*current_nodes_num = (*graph)->Nodes().size();
}
+static const std::initializer_list<std::string> variable_names{
+    "z", "a", "b", "c", "d", "e", "f", "g", "h", "i"};
-ProgramDesc BuildProgramDesc(bool use_mkldnn) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
-  SetOp(&prog, "dropout", "Dropout1", {"z"}, {"a"}, use_mkldnn, "float32");
-  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, "bfloat16");
-  SetOp(&prog, "pool2d", "Pool1", {"b"}, {"c"}, use_mkldnn, "bfloat16");
-  SetOp(&prog, "conv2d", "Conv1", {"c"}, {"d"}, use_mkldnn, "bfloat16");
-  SetOp(&prog, "dropout", "Dropout2", {"d"}, {"e"}, use_mkldnn, "float32");
-  SetOp(&prog, "transpose2", "Transpose1", {"e"}, {"f"}, use_mkldnn,
-        "bfloat16");
-  SetOp(&prog, "reshape2", "Reshape1", {"f"}, {"g"}, use_mkldnn, "bfloat16");
-  SetOp(&prog, "concat", "Concat1", {"g"}, {"h"}, use_mkldnn, "bfloat16");
-  SetOp(&prog, "dropout", "Dropout3", {"h"}, {"i"}, use_mkldnn, "float32");
-  return prog;
-}
-void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
-              int transpose_count, int quant_count, int dequant_count,
-              int added_nodes_count) {
+void MainTest(const ProgramDesc& prog, int quant_count, int dequant_count,
+              int force_fp32_count, int added_nodes_count) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int original_nodes_num, current_nodes_num;
PreparePass(&graph, prog, variable_names, &original_nodes_num,
@@ -103,39 +88,114 @@ void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
int quantize_nodes_count = 0;
int dequantize_nodes_count = 0;
-  int conv2d_nodes_count = 0;
-  int pool2d_nodes_count = 0;
-  int transpose2_nodes_count = 0;
+  int force_fp32_nodes_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->Type() == "conv2d") {
conv2d_nodes_count++;
} else if (op->Type() == "pool2d") {
pool2d_nodes_count++;
} else if (op->Type() == "transpose2") {
transpose2_nodes_count++;
} else if (op->Type() == "quantize") {
if (op->Type() == "quantize") {
quantize_nodes_count++;
} else if (op->Type() == "dequantize") {
dequantize_nodes_count++;
} else if (op->Type() == "conv2d" || op->Type() == "matmul" ||
op->Type() == "fc") {
if (op->GetAttrIfExists<bool>("force_fp32_output"))
force_fp32_nodes_count++;
}
}
}
-  EXPECT_EQ(conv2d_nodes_count, conv_count);
-  EXPECT_EQ(pool2d_nodes_count, pool_count);
-  EXPECT_EQ(transpose2_nodes_count, transpose_count);
EXPECT_EQ(quantize_nodes_count, quant_count);
EXPECT_EQ(dequantize_nodes_count, dequant_count);
+  EXPECT_EQ(force_fp32_nodes_count, force_fp32_count);
EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
}
-TEST(CpuQuantizePass, quantize) {
+ProgramDesc BuildProgramDescConv(bool use_mkldnn) {
+  ProgramDesc prog;
+  for (auto& v : variable_names) {
+    prog.MutableBlock(0)->Var(v);
+  }
+  SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, use_mkldnn, "float32");
+  SetOp(&prog, "conv2d", "Conv1", {"b"}, {"c"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "pool2d", "Pool", {"c"}, {"d"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "transpose2", "Transpose", {"e"}, {"f"}, use_mkldnn, "float32");
+  return prog;
+}
+
+TEST(CpuBfloat16Pass, convolution) {
+  bool use_mkldnn = true;
+  // 0 added + 1 force_fp32_output
+  int added_nodes = 0;
+  MainTest(BuildProgramDescConv(use_mkldnn), 0, 0, 1, added_nodes);
+}
+
+ProgramDesc BuildProgramDescDoubleInput(bool use_mkldnn) {
+  ProgramDesc prog;
+  for (auto& v : variable_names) {
+    prog.MutableBlock(0)->Var(v);
+  }
+  SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, use_mkldnn, "float32");
+  SetOp(&prog, "matmul", "Matmul", {"b", "b"}, {"c"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "transpose2", "Transpose", {"d"}, {"e"}, use_mkldnn, "float32");
SetOp(&prog, "elementwise_add", "ElemetwiseAdd", {"c", "e"}, {"f"},
use_mkldnn, "bfloat16");
SetOp(&prog, "reshape2", "Reshape", {"f"}, {"g"}, use_mkldnn, "bfloat16");
return prog;
}
TEST(CpuBfloat16Pass, double_input_ops) {
bool use_mkldnn = true;
// 2 quant + 2 quant out
int added_nodes = 4;
MainTest(BuildProgramDescDoubleInput(use_mkldnn), 2, 0, 0, added_nodes);
}
ProgramDesc BuildProgramDescDuplicatedInput(bool use_mkldnn) {
ProgramDesc prog;
for (auto& v : variable_names) {
prog.MutableBlock(0)->Var(v);
}
SetOp(&prog, "dropout", "Dropout1", {"a"}, {"b"}, use_mkldnn, "float32");
SetOp(&prog, "dropout", "Dropout2", {"c"}, {"d"}, use_mkldnn, "float32");
SetOp(&prog, "concat", "Concat", {"b", "d"}, {"e"}, use_mkldnn, "bfloat16");
SetOp(&prog, "transpose2", "Transpose", {"f"}, {"g"}, use_mkldnn, "float32");
SetOp(&prog, "sum", "Sum", {"e", "g"}, {"h"}, use_mkldnn, "bfloat16");
SetOp(&prog, "reshape2", "Reshape", {"h"}, {"i"}, use_mkldnn, "bfloat16");
return prog;
}
TEST(CpuBfloat16Pass, duplicated_input_ops) {
bool use_mkldnn = true;
// 3 quant + 3 quant out
int added_nodes = 6;
MainTest(BuildProgramDescDuplicatedInput(use_mkldnn), 3, 0, 0, added_nodes);
}
ProgramDesc BuildProgramDescDoubleOutputs(bool use_mkldnn) {
ProgramDesc prog;
for (auto& v : variable_names) {
prog.MutableBlock(0)->Var(v);
}
SetOp(&prog, "layer_norm", "LayerNorm1", {"a"}, {"b"}, use_mkldnn,
"bfloat16");
SetOp(&prog, "dropout", "Dropout1", {"b"}, {"c"}, use_mkldnn, "float32");
SetOp(&prog, "transpose2", "Transpose", {"b"}, {"d"}, use_mkldnn, "bfloat16");
SetOp(&prog, "layer_norm", "LayerNorm2", {"d"}, {"e"}, use_mkldnn,
"bfloat16");
SetOp(&prog, "reshape2", "Reshape", {"e"}, {"f"}, use_mkldnn, "float32");
SetOp(&prog, "dropout", "Dropout2", {"e"}, {"g"}, use_mkldnn, "float32");
return prog;
}
TEST(CpuBfloat16Pass, double_outputs_ops) {
bool use_mkldnn = true;
-  // 1 quantize + 1 dequantize
-  int added_nodes = 2;
-  MainTest(BuildProgramDesc(use_mkldnn), 2, 1, 1, 1, 2, added_nodes);
+  // 3 dequant + 3 dequant out
+  int added_nodes = 6;
+  MainTest(BuildProgramDescDoubleOutputs(use_mkldnn), 0, 3, 0, added_nodes);
}
} // namespace ir
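
A note on the added_nodes bookkeeping in these tests: each quantize or dequantize op the pass inserts contributes two graph nodes, the op node plus its output variable node. Checking the duplicated-input case above against that rule (plain arithmetic, not test code):

// concat needs its two fp32 inputs ("b", "d") quantized; sum needs one more
// for "g" (its other input "e" is already bfloat16, coming from concat).
int quantize_ops = 3;
int added_nodes = 2 * quantize_ops;  // one op node + one output var each => 6
// matches MainTest(BuildProgramDescDuplicatedInput(use_mkldnn), 3, 0, 0, 6);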
