[Ernie GPU Optim]: Fuse three fc to multihtead matmul (#22486)

* 1. optim multihead matmul: fuse three fc to multihtead matmul test=develop * fix conflict test=develop * fix comments test=develop
5 years ago · 8acd745c25
parent a8dd425aa3
commit 8acd745c25
7 changed files with 557 additions and 192 deletions
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
@ -32,8 +32,6 @@ struct MultiHeadMatmulPattern : public PatternBase {
  PDNode* operator()(PDNode* x);

  // declare operator node's name
-  // PATTERN_DECL_NODE(dropout);
-  // PATTERN_DECL_NODE(dropout_out);
  PATTERN_DECL_NODE(layer_norm);
  PATTERN_DECL_NODE(layer_norm_out);
  PATTERN_DECL_NODE(mul0);
@ -79,8 +77,6 @@ struct MultiHeadMatmulPattern : public PatternBase {
  PATTERN_DECL_NODE(eltadd_qk_out);
  PATTERN_DECL_NODE(softmax_qk);
  PATTERN_DECL_NODE(softmax_qk_out);
-  // PATTERN_DECL_NODE(dropout_qk);
-  // PATTERN_DECL_NODE(dropout_qk_out);

  PATTERN_DECL_NODE(matmul_qkv);
  PATTERN_DECL_NODE(matmul_qkv_out);
@ -98,6 +94,16 @@ class MultiHeadMatmulFusePass : public FusePassBase {
  const std::string name_scope_{"multihead_matmul_fuse"};
 };

+class MultiHeadMatmulV2FusePass : public FusePassBase {
+ public:
+  virtual ~MultiHeadMatmulV2FusePass() {}
+
+ protected:
+  void ApplyImpl(Graph* graph) const;
+
+  const std::string name_scope_{"multihead_matmul_fuse_v2"};
+};
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
@ -17,6 +17,27 @@ namespace paddle {
 namespace framework {
 namespace ir {

+void AddVarToScope(Scope* param_scope, const std::string& name,
+                   const DDim& dims) {
+  auto* tensor = param_scope->Var(name)->GetMutable<LoDTensor>();
+  tensor->Resize(dims);
+  tensor->mutable_data<float>(platform::CPUPlace());
+}
+
+Scope* CreateParamScope() {
+  auto param_scope = new Scope();
+  AddVarToScope(param_scope, "weights0", {768, 768});
+  AddVarToScope(param_scope, "weights1", {768, 768});
+  AddVarToScope(param_scope, "weights2", {768, 768});
+
+  AddVarToScope(param_scope, "bias_0", {768});
+  AddVarToScope(param_scope, "bias_1", {768});
+  AddVarToScope(param_scope, "bias_2", {768});
+  AddVarToScope(param_scope, "biasqk", {768});
+  AddVarToScope(param_scope, "weightsl", {768, 768});
+  return param_scope;
+}
+
 TEST(MultiHeadMatmulFusePass, basic) {
  // inputs                           operator            output
  // --------------------------------------------------------------------
@ -87,7 +108,10 @@ TEST(MultiHeadMatmulFusePass, basic) {
  layers.mul(reshape_qkv_out, weights_l);

  std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
-  auto pass = PassRegistry::Instance().Get("multihead_matmul_fuse_pass");
+  graph->Set("__param_scope__", CreateParamScope());
+
+  auto pass = PassRegistry::Instance().Get("multihead_matmul_fuse_pass_v2");
+  if (pass.get() == nullptr) LOG(INFO) << "asdfasdf";
  int num_nodes_before = graph->Nodes().size();
  VLOG(3) << DebugString(graph);

@ -96,8 +120,17 @@ TEST(MultiHeadMatmulFusePass, basic) {
  int num_fused_nodes_after = GetNumOpNodes(graph, "multihead_matmul");
  VLOG(3) << DebugString(graph);

-  PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 29);
-  PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1);
+  PADDLE_ENFORCE_EQ(
+      num_nodes_before, num_nodes_after + 39,
+      platform::errors::InvalidArgument(
+          "After the multihead_matmul pass, The node num in graph "
+          "should be %d, but the result is %d",
+          num_nodes_before - 39, num_nodes_after));
+  PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1,
+                    platform::errors::InvalidArgument(
+                        "After the multihead_matmul pass, there should be one "
+                        "multihead_matmul op, but the result is %d",
+                        num_fused_nodes_after));
 }

 }  // namespace ir
@ -105,3 +138,4 @@ TEST(MultiHeadMatmulFusePass, basic) {
 }  // namespace paddle

 USE_PASS(multihead_matmul_fuse_pass);
+USE_PASS(multihead_matmul_fuse_pass_v2);
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@ -107,7 +107,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
        "conv_eltwiseadd_affine_channel_fuse_pass",  //
        "conv_bn_fuse_pass",                         //
        "conv_eltwiseadd_bn_fuse_pass",              //
-        "multihead_matmul_fuse_pass",
+        "multihead_matmul_fuse_pass_v2",
        "fc_fuse_pass",                        //
        "fc_elementwise_layernorm_fuse_pass",  //
 #if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
--- a/paddle/fluid/operators/fused/multihead_matmul_op.cc
+++ b/paddle/fluid/operators/fused/multihead_matmul_op.cc
@ -15,126 +15,80 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/errors.h"

 namespace paddle {
 namespace operators {

-class MultiHeadMatMulOp : public framework::OperatorWithKernel {
+class MultiHeadMatMulV2Op : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE_EQ(context->HasInput("Q"), true,
-                      "Input(Q) of MultiheadOp should not be null.");
-    PADDLE_ENFORCE_EQ(context->HasInput("K"), true,
-                      "Input(K) of MultiheadOp should not be null.");
-    PADDLE_ENFORCE_EQ(context->HasInput("V"), true,
-                      "Input(V) of MultiheadOp should not be null.");
-    PADDLE_ENFORCE_EQ(context->HasInput("BiasQ"), true,
-                      "Input(BiasQ) of MultiheadOp should not be null.");
-    PADDLE_ENFORCE_EQ(context->HasInput("BiasK"), true,
-                      "Input(BiasQ) of MultiheadOp should not be null.");
-    PADDLE_ENFORCE_EQ(context->HasInput("BiasV"), true,
-                      "Input(BiasQ) of MultiheadOp should not be null.");
-    PADDLE_ENFORCE_EQ(context->HasInput("BiasQK"), true,
-                      "Input(BiasQK) of MultiheadOp should not be null.");
-    PADDLE_ENFORCE_EQ(context->HasOutput("Out"), true,
-                      "Output(Out) of MatMulOp should not be null.");
-
-    auto dim_q = context->GetInputDim("Q");
-    PADDLE_ENFORCE_GT(dim_q.size(), 2,
-                      "Multihead input should be at least 3-D tensor.");
-
-    auto dim_k = context->GetInputDim("K");
-    PADDLE_ENFORCE_GT(dim_q.size(), 2,
-                      "Multihead input should be at least 3-D tensor.");
-
-    auto dim_v = context->GetInputDim("V");
-    PADDLE_ENFORCE_GT(dim_q.size(), 2,
-                      "Multihead input should be at least 3-D tensor.");
-
-    PADDLE_ENFORCE_EQ(dim_q[0], dim_k[0],
-                      "Multihead input should have same batch size");
-    PADDLE_ENFORCE_EQ(dim_q[0], dim_v[0],
-                      "Multihead input should have same batch size");
-
-    PADDLE_ENFORCE_EQ(dim_q[1], dim_k[1],
-                      "Multihead input should have same size");
-    PADDLE_ENFORCE_EQ(dim_q[1], dim_v[1],
-                      "Multihead input should have same size");
-
-    PADDLE_ENFORCE_EQ(dim_q[2], dim_k[2],
-                      "Multihead input should have same size");
-    PADDLE_ENFORCE_EQ(dim_q[2], dim_v[2],
-                      "Multihead input should have same size");
-
-    auto dim_bias_q = context->GetInputDim("BiasQ");
-    PADDLE_ENFORCE_GT(dim_bias_q.size(), 0,
-                      "Multihead input should be at least 1-D tensor.");
-    auto dim_bias_k = context->GetInputDim("BiasK");
-    PADDLE_ENFORCE_GT(dim_bias_k.size(), 0,
-                      "Multihead input should be at least 1-D tensor.");
-    auto dim_bias_v = context->GetInputDim("BiasV");
-    PADDLE_ENFORCE_GT(dim_bias_v.size(), 0,
-                      "Multihead input should be at least 1-D tensor.");
-
-    PADDLE_ENFORCE_EQ(dim_bias_q[0], dim_bias_k[0],
-                      "Multihead input bias should have same batch size");
-    PADDLE_ENFORCE_EQ(dim_bias_q[0], dim_bias_v[0],
-                      "Multihead input bias should have same batch size");
-
-    auto dim_bias_qk = context->GetInputDim("BiasQK");
-    PADDLE_ENFORCE_GT(dim_bias_qk.size(), 3,
-                      "Multihead input bias qk should be at least 4-D tensor.");
-
-    int b_indx = dim_bias_q.size() - 1;
-    int indx = dim_q.size() - 1;
-
    PADDLE_ENFORCE_EQ(
-        dim_bias_q[b_indx], dim_q[indx],
+        context->HasInput("Input"), true,
        platform::errors::InvalidArgument(
-            "bias_q's last dim size should equal to"
-            " q last dim size, but received bias_q's size is:%d q is:%d",
-            dim_bias_q[b_indx], dim_q[indx]));
+            "Input(Input) of MultiHeadMatMul should not be null."));
+    PADDLE_ENFORCE_EQ(context->HasInput("W"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(W) of MultiHeadMatMul should not be null."));
    PADDLE_ENFORCE_EQ(
-        dim_bias_k[b_indx], dim_k[indx],
+        context->HasInput("Bias"), true,
        platform::errors::InvalidArgument(
-            "bias_k's last dim size should equal to"
-            " k last dim size, but received bias_k's size is:%d k is:%d",
-            dim_bias_k[b_indx], dim_k[indx]));
+            "Input(Bias) of MultiHeadMatMul should not be null."));
    PADDLE_ENFORCE_EQ(
-        dim_bias_v[b_indx], dim_v[indx],
+        context->HasInput("BiasQK"), true,
        platform::errors::InvalidArgument(
-            "bias_v's last dim size should equal to"
-            " v last dim size, but received bias_v's size is:%d v is:%d",
-            dim_bias_v[b_indx], dim_v[indx]));
+            "Input(BiasQK) of MultiHeadMatMul should not be null."));
+    PADDLE_ENFORCE_EQ(
+        context->HasOutput("Out"), true,
+        platform::errors::InvalidArgument(
+            "Output(Out) of MultiHeadMatMul should not be null."));

-    PADDLE_ENFORCE_EQ(dim_q[0], dim_bias_qk[0],
-                      platform::errors::InvalidArgument(
-                          "q should have same batch size"
-                          "with bias_qk, but received q's batch size is:%d "
-                          "bias_qk's batch size is:%d",
-                          dim_q[0], dim_bias_qk[0]));
+    auto dim_w = context->GetInputDim("W");
+    PADDLE_ENFORCE_GT(
+        dim_w.size(), 2,
+        platform::errors::InvalidArgument(
+            "Multihead input is expected at least a 3-D tensor, but "
+            "it's %d-D tensor now.",
+            dim_w.size()));

-    int head_number = context->Attrs().Get<int>("head_number");
-    PADDLE_ENFORCE_GT(head_number, 1,
-                      "Multihead input head number should be at least 1.");
+    auto dim_bias_q = context->GetInputDim("Bias");
+    PADDLE_ENFORCE_GT(
+        dim_bias_q.size(), 1,
+        platform::errors::InvalidArgument(
+            "Multihead input should be at least 2-D tensor, but it's "
+            "%d-D tensor now.",
+            dim_bias_q.size()));
+
+    auto dim_bias_qk = context->GetInputDim("BiasQK");
+    PADDLE_ENFORCE_GT(
+        dim_bias_qk.size(), 3,
+        platform::errors::InvalidArgument(
+            "Multihead input bias qk should be at least 4-D tensor, "
+            "but it's %d-D tensor now.",
+            dim_bias_qk.size()));

-    context->SetOutputDim("Out", dim_q);
-    context->ShareLoD("Q", /*->*/ "Out");
+    int head_number = context->Attrs().Get<int>("head_number");
+    PADDLE_ENFORCE_GT(
+        head_number, 1,
+        platform::errors::InvalidArgument(
+            "Multihead input head number should be at least 1, but it %d now.",
+            head_number));
+    // modify this
+    auto dim_input = context->GetInputDim("Input");
+    context->SetOutputDim("Out", dim_input);
+    context->ShareLoD("Input", /*->*/ "Out");
  }
 };

-class MultiHeadMatMulOpMaker : public framework::OpProtoAndCheckerMaker {
+class MultiHeadMatMulV2OpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("Q", "The first input of MultiHeadMatMul op");
-    AddInput("K", "The second input of MMultiHeadMatMul op");
-    AddInput("V", "The third input of MultiHeadMatMul op");
-    AddInput("BiasQ", "The first bias input of MultiHeadMatMul op");
-    AddInput("BiasK", "The second bias input of MultiHeadMatMul op");
-    AddInput("BiasV", "The third  bias input of MultiHeadMatMul op");
+    AddInput("Input", "The input of MultiHeadMatMul op");
+    AddInput("W", "The weight input of MultiHeadMatMul op");
+    AddInput("Bias", "The bias input of MultiHeadMatMul op");
    AddInput("BiasQK", "The QK bias input of MultiHeadMatMul op");
    AddOutput("Out", "The output of MultiHeadMatMul op");
    AddAttr<bool>("transpose_Q",
@ -161,10 +115,6 @@ Not suggest to use in other case except has same structure as ernie.
 Example of matrix multiplication with head_number of B
 - X: [B, M, K], Y: [B, K, N] => Out: [B, M, N]

-Both the input `Q` and `K` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD information with input `Q`, because
-they are the same.
-
 )DOC");
  }
 };
@ -173,5 +123,5 @@ they are the same.
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(multihead_matmul, ops::MultiHeadMatMulOp,
-                             ops::MultiHeadMatMulOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(multihead_matmul, ops::MultiHeadMatMulV2Op,
+                             ops::MultiHeadMatMulV2OpMaker);
--- a/paddle/fluid/operators/fused/multihead_matmul_op.cu
+++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu
--- a/python/paddle/fluid/tests/unittests/test_fused_multihead_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_multihead_matmul_op.py
@ -47,12 +47,21 @@ class TestFusedMultiheadMatmulOp(OpTest):
        self.config()
        h = self.seq_len
        w = self.head_number * self.size_per_head
-        self.Q = np.random.random((self.batch_size, h, w)).astype("float32")
-        self.K = np.random.random((self.batch_size, h, w)).astype("float32")
-        self.V = np.random.random((self.batch_size, h, w)).astype("float32")
+        self.Input = np.random.random(
+            (self.batch_size, h, w)).astype("float32") - 0.5
+        self.WQ = np.random.random((w, w)).astype("float32")
+        self.KQ = np.random.random((w, w)).astype("float32")
+        self.VQ = np.random.random((w, w)).astype("float32")
+        self.CombinedW = np.hstack((self.WQ, self.KQ, self.VQ)).reshape(
+            (w, 3, w))
+        self.Q = np.dot(self.Input, self.WQ)
+        self.K = np.dot(self.Input, self.KQ)
+        self.V = np.dot(self.Input, self.VQ)
+
        self.BiasQ = np.random.random((1, w)).astype("float32")
        self.BiasK = np.random.random((1, w)).astype("float32")
        self.BiasV = np.random.random((1, w)).astype("float32")
+        self.CombinedB = np.vstack((self.BiasQ, self.BiasK, self.BiasV))
        self.BiasQK = np.random.random(
            (self.batch_size, self.head_number, self.seq_len,
             self.seq_len)).astype("float32")
@ -84,12 +93,9 @@ class TestFusedMultiheadMatmulOp(OpTest):
        reshape_qkv = np.reshape(transpose_qkv, (self.batch_size, h, w))

        self.inputs = {
-            "Q": self.Q,
-            "K": self.K,
-            "V": self.V,
-            "BiasQ": self.BiasQ,
-            "BiasK": self.BiasK,
-            "BiasV": self.BiasV,
+            "Input": self.Input,
+            "W": self.CombinedW,
+            "Bias": self.CombinedB,
            "BiasQK": self.BiasQK
        }
        self.attrs = {