update embedding_eltwise_layernorm fuse and kernel. test=develop (#23114)
update embedding_eltwise_layernorm fuse pass and fused kernel, to support multiple inputs
parent a31d7328b7
commit 95b356a069
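For context, the pass this commit updates collapses several lookup_table ops, the elementwise_add chain that sums their outputs, and the trailing layer_norm into a single fused op. A minimal standalone sketch of that computation follows (plain standard C++ with illustrative names such as ids, tables, scale, bias, eps; this is not the actual Paddle kernel, only the math it is expected to implement):

// Sketch of the fused computation: sum N embedding lookups per position,
// then layer-normalize over the hidden dimension. Illustrative only.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

// ids:    n_inputs id sequences, each of length seq_len.
// tables: n_inputs embedding tables, tables[i] has shape [vocab_i, hidden].
// Returns a [seq_len, hidden] matrix, flattened row-major.
std::vector<float> EmbeddingEltwiseLayerNorm(
    const std::vector<std::vector<int64_t>>& ids,
    const std::vector<std::vector<std::vector<float>>>& tables,
    const std::vector<float>& scale, const std::vector<float>& bias,
    float eps = 1e-5f) {
  const size_t n_inputs = ids.size();
  assert(n_inputs == tables.size() && n_inputs > 0);
  const size_t seq_len = ids[0].size();
  const size_t hidden = tables[0][0].size();
  std::vector<float> out(seq_len * hidden, 0.f);

  // 1. lookup_table + elementwise_add chain, collapsed into one accumulation:
  //    for every input, add the embedding row selected by its id.
  for (size_t i = 0; i < n_inputs; ++i) {
    for (size_t t = 0; t < seq_len; ++t) {
      const auto& row = tables[i][static_cast<size_t>(ids[i][t])];
      for (size_t h = 0; h < hidden; ++h) out[t * hidden + h] += row[h];
    }
  }

  // 2. layer_norm over the hidden dimension of each position.
  for (size_t t = 0; t < seq_len; ++t) {
    float mean = 0.f, var = 0.f;
    for (size_t h = 0; h < hidden; ++h) mean += out[t * hidden + h];
    mean /= hidden;
    for (size_t h = 0; h < hidden; ++h) {
      float d = out[t * hidden + h] - mean;
      var += d * d;
    }
    var /= hidden;
    const float inv_std = 1.f / std::sqrt(var + eps);
    for (size_t h = 0; h < hidden; ++h) {
      out[t * hidden + h] =
          (out[t * hidden + h] - mean) * inv_std * scale[h] + bias[h];
    }
  }
  return out;
}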
@@ -0,0 +1,98 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h"

#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass_tester_helper.h"

namespace paddle {
namespace framework {
namespace ir {

TEST(SkipLayerNormFusePass, basic) {
  // inputs                          operator           output
  // --------------------------------------------------------------------
  // (x, emb)                        lookup_table    -> lookup_out
  // (lookup_out, lookup_out)        elementwise_add -> elementwise_out
  // (elementwise_out, scale, bias)  layer_norm      -> layer_norm_out...
  Layers layers;
  auto* x0 = layers.data("x0", {1, 256, 1});
  auto* x1 = layers.data("x1", {1, 256, 1});
  auto* x2 = layers.data("x2", {1, 256, 1});
  auto* x3 = layers.data("x3", {1, 256, 1});

  auto* emb0 = layers.data("emb0", {18000, 768}, true);
  auto* emb1 = layers.data("emb1", {4, 768}, true);
  auto* emb2 = layers.data("emb2", {513, 768}, true);
  auto* emb3 = layers.data("emb3", {3, 768}, true);

  auto* lkt0 = layers.embedding(x0, emb0);
  auto* lkt1 = layers.embedding(x1, emb1);
  auto* lkt2 = layers.embedding(x2, emb2);
  auto* lkt3 = layers.embedding(x3, emb3);

  auto* elementwise_out1 = layers.elementwise_add(lkt0, lkt2);
  auto* elementwise_out2 = layers.elementwise_add(elementwise_out1, lkt1);
  auto* elementwise_out3 = layers.elementwise_add(elementwise_out2, lkt3);

  auto* scale = layers.data("scale", {768}, true);
  auto* bias = layers.data("bias", {768}, true);
  layers.layer_norm(elementwise_out3, scale, bias);

  auto* y0 = layers.data("y0", {1, 256, 1});
  auto* y1 = layers.data("y1", {1, 256, 1});
  auto* y2 = layers.data("y2", {1, 256, 1});

  auto* emb0y = layers.data("emb0y", {18000, 768}, true);
  auto* emb1y = layers.data("emb1y", {4, 768}, true);
  auto* emb2y = layers.data("emb2y", {513, 768}, true);

  auto* lkt0y = layers.embedding(y0, emb0y);
  auto* lkt1y = layers.embedding(y1, emb1y);
  auto* lkt2y = layers.embedding(y2, emb2y);

  auto* elementwise_out1y = layers.elementwise_add(lkt0y, lkt2y);
  auto* elementwise_out2y = layers.elementwise_add(elementwise_out1y, lkt1y);

  auto* scaley = layers.data("scaley", {768}, true);
  auto* biasy = layers.data("biasy", {768}, true);
  layers.layer_norm(elementwise_out2y, scaley, biasy);

  std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
  auto pass =
      PassRegistry::Instance().Get("embedding_eltwise_layernorm_fuse_pass");
  int num_nodes_before = graph->Nodes().size();
  VLOG(3) << DebugString(graph);

  graph.reset(pass->Apply(graph.release()));
  int num_nodes_after = graph->Nodes().size();
  int num_fused_nodes_after =
      GetNumOpNodes(graph, "fused_embedding_eltwise_layernorm");
  VLOG(3) << DebugString(graph);

  PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 28,
                    platform::errors::PreconditionNotMet(
                        "The number of nodes before and after the fuse does "
                        "not meet expectations"));
  PADDLE_ENFORCE_EQ(
      num_fused_nodes_after, 2,
      platform::errors::PreconditionNotMet(
          "The number of fusion nodes does not meet expectations after fuse"));
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

USE_PASS(embedding_eltwise_layernorm_fuse_pass);
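Outside of this unit test, the pass normally runs as part of the inference analysis pipeline. A hypothetical, heavily simplified sketch of requesting it explicitly through the C++ inference API is below; the model path and GPU setup are placeholders, and the exact API surface may differ between Paddle versions:

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model_dir");  // placeholder model directory
  config.EnableUseGpu(100, 0);            // assumed GPU setup for the fused kernel
  // Ask the analysis pipeline to run the embedding + eltwise_add + layer_norm fuse.
  config.pass_builder()->AppendPass("embedding_eltwise_layernorm_fuse_pass");
  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor != nullptr ? 0 : 1;
}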