[Paddle-TRT]: Ernie Dynamic shape support. (#23138)

* add dynamic plugin support.
test=develop

* change emb eltwise layernorm to math function
test=develop

* add emb eltwise layernorm
test=develop

* can run dynamic shape ernie
test=develop

* fix ci
test=develop

* add ut for trt ernie dynamic

test=develop

* refine dynamic shape c++ interface.
test=develop

* fix comments
test=develop

* fix comments
test=develop
Zhaolong Xing committed via GitHub
parent d0413e58d3
commit 430b0099c9

@ -101,8 +101,10 @@ function(select_nvcc_arch_flags out_variable)
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
add_definitions("-DSUPPORTS_CUDA_FP16")
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
add_definitions("-DSUPPORTS_CUDA_FP16")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
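The SUPPORTS_CUDA_FP16 definition added for Volta and Turing is what the new plugins check at compile time before taking their half-precision paths (see the #ifdef SUPPORTS_CUDA_FP16 blocks in the skip_layernorm plugin later in this diff). A minimal sketch of the pattern, with a hypothetical helper name:

// Hypothetical helper (not part of this PR): report whether the build was
// configured for an arch where the plugins may use their fp16 kernels.
bool BuildSupportsPluginFp16() {
#ifdef SUPPORTS_CUDA_FP16
  return true;   // compiled for Volta (SM 7.0) or Turing (SM 7.5)
#else
  return false;  // older archs: plugins fall back to their fp32 paths
#endif
}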

@ -176,10 +176,14 @@ struct Argument {
DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
// usually use for trt dynamic shape.
// Usually used for TRT dynamic shape.
// TRT will select the best kernel according to the opt shape.
// Setting disable_trt_plugin_fp16 to true means that the TRT plugin will
// not run in fp16.
DECL_ARGUMENT_FIELD(min_input_shape, MinInputShape, input_shape_t);
DECL_ARGUMENT_FIELD(max_input_shape, MaxInputShape, input_shape_t);
DECL_ARGUMENT_FIELD(optim_input_shape, OptimInputShape, input_shape_t);
DECL_ARGUMENT_FIELD(disable_trt_plugin_fp16, CloseTrtPluginFp16, bool);
DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
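The three input_shape_t fields map each TRT subgraph input name to a shape; TRT then tunes kernels for the opt shape while accepting anything between min and max. A minimal sketch of what flows through these arguments (the tensor name is hypothetical):

#include <map>
#include <string>
#include <vector>

// Keys are the names of the TRT subgraph's input tensors; values are shapes.
std::map<std::string, std::vector<int>> min_input_shape{{"read_file_0.tmp_0", {1, 1, 1}}};
std::map<std::string, std::vector<int>> max_input_shape{{"read_file_0.tmp_0", {16, 128, 1}}};
std::map<std::string, std::vector<int>> optim_input_shape{{"read_file_0.tmp_0", {8, 64, 1}}};
bool disable_trt_plugin_fp16 = false;  // keep plugin fp16 enabled when the engine uses kHalf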

@ -130,6 +130,11 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("optim_input_shape",
new std::map<std::string, std::vector<int>>(
argument->optim_input_shape()));
// Setting disable_trt_plugin_fp16 to true means that the TRT plugin will
// not run in fp16.
pass->Set("disable_trt_plugin_fp16",
new bool(argument->disable_trt_plugin_fp16()));
}
if (pass_name == "ngraph_subgraph_pass") {
pass->Set("program",

@ -272,7 +272,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
// Check trt version for dynamic shape input.
if (min_input_shape.size() > 0 && TRT_VERSION < 6000) {
std::cout << "hello";
LOG_FIRST_N(WARNING, 1) << "You are using the dynamic size input mode of "
"Paddle-TRT, but we found that the version of "
"the TensorRT is less than 6.0, so we use the "
@ -284,18 +283,23 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
if (min_input_shape.size() > 0 && TRT_VERSION > 6000) {
LOG_FIRST_N(WARNING, 1)
<< "The Paddle lib links the " << TRT_VERSION / 1000.
<< " version TensorRT, "
<< "The Paddle lib links the " << TRT_VERSION << " version TensorRT, "
<< "make sure the runtime TensorRT you are using is no less than this "
"version, otherwise, there might be Segfault!";
}
// Setting disable_trt_plugin_fp16 to true means that the TRT plugin will not
// run in fp16.
// Running in fp16 can affect the output accuracy of the model; disabling the
// plugin fp16 mode may improve accuracy.
bool disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
tensorrt::TensorRTEngine *trt_engine =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.Create(engine_key + std::to_string(predictor_id),
Get<int>("max_batch_size"), Get<int>("workspace_size"),
precision_mode, calibrator.get(), Get<int>("gpu_device_id"),
min_input_shape, max_input_shape, opt_input_shape);
min_input_shape, max_input_shape, opt_input_shape,
disable_trt_plugin_fp16);
bool need_serialize = (use_static_engine && !load_from_memory);
if (need_serialize) {

@ -128,6 +128,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(min_input_shape_);
CP_MEMBER(max_input_shape_);
CP_MEMBER(optim_input_shape_);
CP_MEMBER(disable_trt_plugin_fp16_);
CP_MEMBER(use_lite_);
CP_MEMBER(lite_precision_mode_);
@ -226,10 +227,7 @@ MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
void AnalysisConfig::EnableTensorRtEngine(
int workspace_size, int max_batch_size, int min_subgraph_size,
AnalysisConfig::Precision precision_mode, bool use_static,
bool use_calib_mode,
std::map<std::string, std::vector<int>> min_input_shape,
std::map<std::string, std::vector<int>> max_input_shape,
std::map<std::string, std::vector<int>> optim_input_shape) {
bool use_calib_mode) {
#ifdef PADDLE_WITH_CUDA
if (!use_gpu()) {
LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
@ -243,9 +241,6 @@ void AnalysisConfig::EnableTensorRtEngine(
tensorrt_precision_mode_ = precision_mode;
trt_use_static_engine_ = use_static;
trt_use_calib_mode_ = use_calib_mode;
min_input_shape_ = min_input_shape;
max_input_shape_ = max_input_shape;
optim_input_shape_ = optim_input_shape;
Update();
#else
@ -254,6 +249,17 @@ void AnalysisConfig::EnableTensorRtEngine(
#endif
}
void AnalysisConfig::SetTRTDynamicShapeInfo(
std::map<std::string, std::vector<int>> min_input_shape,
std::map<std::string, std::vector<int>> max_input_shape,
std::map<std::string, std::vector<int>> optim_input_shape,
bool disable_trt_plugin_fp16) {
min_input_shape_ = min_input_shape;
max_input_shape_ = max_input_shape;
optim_input_shape_ = optim_input_shape;
disable_trt_plugin_fp16_ = disable_trt_plugin_fp16;
}
// TODO(Superjomn) refactor this, buggy.
void AnalysisConfig::Update() {
auto info = SerializeInfoCache();

@ -428,6 +428,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetMinInputShape(config_.min_input_shape_);
argument_.SetMaxInputShape(config_.max_input_shape_);
argument_.SetOptimInputShape(config_.optim_input_shape_);
argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_);
}
if (config_.lite_engine_enabled()) {
@ -951,4 +952,6 @@ USE_TRT_CONVERTER(instance_norm);
USE_TRT_CONVERTER(layer_norm);
USE_TRT_CONVERTER(gelu);
USE_TRT_CONVERTER(multihead_matmul);
USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm);
USE_TRT_CONVERTER(skip_layernorm);
#endif

@ -222,16 +222,29 @@ struct AnalysisConfig {
* @param min_subgraph_size the minimum TensorRT subgraph size needed; if a
* subgraph is smaller than this, it will not be transferred to the TensorRT engine.
*/
void EnableTensorRtEngine(
int workspace_size = 1 << 20, int max_batch_size = 1,
int min_subgraph_size = 3, Precision precision = Precision::kFloat32,
bool use_static = false, bool use_calib_mode = true,
std::map<std::string, std::vector<int>> min_input_shape = {},
std::map<std::string, std::vector<int>> max_input_shape = {},
std::map<std::string, std::vector<int>> optim_input_shape = {});
void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3,
Precision precision = Precision::kFloat32,
bool use_static = false,
bool use_calib_mode = true);
/** A boolean state telling whether the TensorRT engine is used.
*/
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
/**
* \brief Set min, max, opt shape for TensorRT Dynamic shape mode.
* @param min_input_shape the min input shape of the subgraph input
* @param max_input_shape the max input shape of the subgraph input
* @param opt_input_shape the opt input shape of the subgraph input
* @param disable_trt_plugin_fp16 setting this to true
* means that the TRT plugin will not run in fp16
*/
void SetTRTDynamicShapeInfo(
std::map<std::string, std::vector<int>> min_input_shape,
std::map<std::string, std::vector<int>> max_input_shape,
std::map<std::string, std::vector<int>> optim_input_shape,
bool disable_trt_plugin_fp16 = false);
/**
* \brief Turn on the usage of Lite sub-graph engine.
*/
@ -386,6 +399,10 @@ struct AnalysisConfig {
Precision tensorrt_precision_mode_;
bool trt_use_static_engine_;
bool trt_use_calib_mode_;
std::map<std::string, std::vector<int>> min_input_shape_{};
std::map<std::string, std::vector<int>> max_input_shape_{};
std::map<std::string, std::vector<int>> optim_input_shape_{};
bool disable_trt_plugin_fp16_{false};
// memory reuse related.
bool enable_memory_optim_{false};
@ -412,9 +429,6 @@ struct AnalysisConfig {
std::string serialized_info_cache_;
mutable std::unique_ptr<PassStrategy> pass_builder_;
std::map<std::string, std::vector<int>> min_input_shape_;
std::map<std::string, std::vector<int>> max_input_shape_;
std::map<std::string, std::vector<int>> optim_input_shape_;
bool use_lite_{false};
std::vector<std::string> lite_passes_filter_;
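Taken together, the new interface is used roughly as follows. This is a minimal sketch assuming an Ernie-style model with four id inputs; the tensor names, shapes, and model path are hypothetical and must match the saved inference program:

#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("ernie_model_4");  // hypothetical model directory
  config.EnableUseGpu(100 /*memory_pool_mb*/, 0 /*gpu_id*/);
  config.EnableTensorRtEngine(1 << 30 /*workspace_size*/, 1 /*max_batch_size*/,
                              5 /*min_subgraph_size*/,
                              paddle::AnalysisConfig::Precision::kFloat32,
                              false /*use_static*/, false /*use_calib_mode*/);

  // min/max/opt shapes for every input of the TRT subgraph (names hypothetical).
  std::map<std::string, std::vector<int>> min_shape, max_shape, opt_shape;
  for (const std::string& name :
       {"src_ids", "pos_ids", "sent_ids", "input_mask"}) {
    min_shape[name] = {1, 1, 1};
    max_shape[name] = {8, 128, 1};
    opt_shape[name] = {4, 64, 1};
  }
  config.SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape,
                                false /*disable_trt_plugin_fp16*/);

  auto predictor = paddle::CreatePaddlePredictor(config);
  // ... set the input id tensors and run the predictor as usual ...
  return 0;
}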

@ -77,12 +77,14 @@ const std::vector<std::string> kTRTSubgraphPasses({
"quant_conv2d_dequant_fuse_pass", //
"delete_quant_dequant_op_pass", //
// "fc_fuse_pass", //
"simplify_with_basic_ops_pass", //
"multihead_matmul_fuse_pass", //
"conv_bn_fuse_pass", //
"fc_fuse_pass", //
"tensorrt_subgraph_pass", //
"conv_bn_fuse_pass", //
"simplify_with_basic_ops_pass", //
"embedding_eltwise_layernorm_fuse_pass", //
"multihead_matmul_fuse_pass_v2", //
"skip_layernorm_fuse_pass", //
"conv_bn_fuse_pass", //
"fc_fuse_pass", //
"tensorrt_subgraph_pass", //
"conv_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass", //

@ -3,7 +3,7 @@ nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc
shuffle_channel_op.cc swish_op.cc instance_norm_op.cc
shuffle_channel_op.cc swish_op.cc instance_norm_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS

@ -0,0 +1,115 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class EmbEltwiseLayerNormOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
#if IS_TRT_VERSION_GE(6000)
VLOG(4) << "convert fluid swish op to tensorrt layer";
framework::OpDesc op_desc(op, nullptr);
auto id_names = op_desc.Input("Ids");
auto emb_names = op_desc.Input("Embs");
PADDLE_ENFORCE_EQ(id_names.size(), emb_names.size(),
platform::errors::InvalidArgument(
"The id and emb size of fused EmbEltwiseLayerNormOp "
"should be same "));
int input_num = id_names.size();
// Declare inputs
std::vector<nvinfer1::ITensor*> input_ids;
for (int i = 0; i < input_num; i++) {
input_ids.push_back(engine_->GetITensor(id_names[i]));
}
std::vector<float*> input_embs;
std::vector<int> emb_sizes;
// Get the persistable variable's data.
auto get_persistable_data = [&](const std::string& var_name,
framework::DDim* dims) -> float* {
auto* temp_var = scope.FindVar(var_name);
auto* temp_tensor = temp_var->GetMutable<framework::LoDTensor>();
(*dims) = temp_tensor->dims();
auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor, false);
return temp_data;
};
int hidden = 0;
for (int i = 0; i < input_num; i++) {
framework::DDim emb_dims;
float* emb_data = get_persistable_data(emb_names[i], &emb_dims);
int64_t emb_size = framework::product(emb_dims);
input_embs.push_back(emb_data);
emb_sizes.push_back(emb_size);
PADDLE_ENFORCE_EQ(
emb_dims.size(), 2,
platform::errors::InvalidArgument(
"The fused EmbEltwiseLayerNorm's emb should be 2 dims."));
hidden = emb_dims[1];
}
framework::DDim bias_dims, scale_dims;
auto* bias =
get_persistable_data(op_desc.Input("Bias").front(), &bias_dims);
auto* scale =
get_persistable_data(op_desc.Input("Scale").front(), &scale_dims);
int64_t bias_size = framework::product(bias_dims);
int64_t scale_size = framework::product(scale_dims);
float eps = boost::get<float>(op_desc.GetAttr("epsilon"));
nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) {
plugin::EmbEltwiseLayernormPluginDynamic* plugin =
new plugin::EmbEltwiseLayernormPluginDynamic(input_embs, bias, scale,
emb_sizes, bias_size,
scale_size, hidden, eps);
layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
} else {
PADDLE_THROW(platform::errors::Fatal(
"You are running the Ernie(Bert) model in static"
"shape mode, which is not supported for the time being.\n"
"You can use the config.SetTRTDynamicShapeInfo(...) interface"
" to set the shape information to run the dynamic shape mode."));
}
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name},
test_mode);
#else
PADDLE_THROW(platform::errors::Fatal(
"You are running the TRT Dynamic Shape mode, need to confirm that "
"your TRT version is no less than 6.0"));
#endif
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(fused_embedding_eltwise_layernorm,
EmbEltwiseLayerNormOpConverter);
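For reference, the converter above consumes a fused op of roughly the following form, which embedding_eltwise_layernorm_fuse_pass builds automatically; this is a sketch only, and the variable names are hypothetical:

#include "paddle/fluid/framework/op_desc.h"

// Sketch only: the fuse pass builds this op; names are hypothetical.
void BuildFusedEmbOpDesc(paddle::framework::OpDesc* op_desc) {
  op_desc->SetType("fused_embedding_eltwise_layernorm");
  op_desc->SetInput("Ids", {"word_ids", "pos_ids", "sent_ids"});   // int64, [batch, seq_len, 1]
  op_desc->SetInput("Embs", {"word_emb", "pos_emb", "sent_emb"});  // float, 2-D [vocab, hidden]
  op_desc->SetInput("Bias", {"layer_norm_bias"});
  op_desc->SetInput("Scale", {"layer_norm_scale"});
  op_desc->SetOutput("Out", {"emb_out"});  // the plugin lays this out as [batch, seq_len, hidden, 1, 1]
  op_desc->SetAttr("epsilon", 1e-5f);      // layer norm epsilon
}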

@ -18,32 +18,6 @@ namespace paddle {
namespace inference {
namespace tensorrt {
// Reorder the elements from istrides to ostrides, borrowed from TRT convert in
// tensorflow.
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318
template <typename T>
void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
T* odata, nvinfer1::DimsHW ostrides) {
for (int h = 0; h < shape.h(); ++h) {
for (int w = 0; w < shape.w(); ++w) {
odata[h * ostrides.h() + w * ostrides.w()] =
idata[h * istrides.h() + w * istrides.w()];
}
}
}
// indata c * k
// Reorder the data layout from CK to KC.
void ReorderCKtoKC(TensorRTEngine::Weight& iweights, // NOLINT
TensorRTEngine::Weight* oweights) {
int c = iweights.dims[0];
int k = iweights.dims[1];
oweights->dims.assign({k, c});
nvinfer1::DimsHW istrides = {1, k};
nvinfer1::DimsHW ostrides = {c, 1};
Reorder2({k, c}, static_cast<float const*>(iweights.get().values), istrides,
static_cast<float*>(const_cast<void*>(oweights->get().values)),
ostrides);
}
/*
* FC converter convert a MUL op in Fluid to a FC layer in TRT.
*/
@ -64,7 +38,6 @@ class FcOpConverter : public OpConverter {
}
// Declare inputs
auto* X = engine_->GetITensor(op_desc.Input(i_name).front());
// Declare weights
auto* Y_v = scope.FindVar(op_desc.Input(w_name).front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
@ -101,28 +74,44 @@ class FcOpConverter : public OpConverter {
PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix
size_t n_output = Y_t->dims()[1];
std::unique_ptr<framework::Tensor> tmp(new framework::LoDTensor());
tmp->Resize(Y_t->dims());
int m = Y_t->dims()[0];
int n = Y_t->dims()[1];
auto tranpose_weight = [](const float* src, float* dst, int m, int n) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
dst[j * m + i] = src[i * n + j];
}
}
};
auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output,
TensorRTEngine::Weight& weight,
TensorRTEngine::Weight& bias) {
auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs,
n_output, weight.get(), bias.get());
auto output_name = op_desc.Output("Out").front();
if (activation_type == "relu") {
nvinfer1::IActivationLayer* relu_layer =
TRT_ENGINE_ADD_LAYER(engine_, Activation, *(fc_layer->getOutput(0)),
nvinfer1::ActivationType::kRELU);
RreplenishLayerAndOutput(relu_layer, "fc", {output_name}, test_mode);
} else {
RreplenishLayerAndOutput(fc_layer, "fc", {output_name}, test_mode);
}
};
std::vector<float> weight_data_tmp;
weight_data_tmp.reserve(Y_t->numel());
memcpy(weight_data_tmp.data(), weight_data, Y_t->numel() * sizeof(float));
tranpose_weight(weight_data_tmp.data(), weight_data, m, n);
memcpy(tmp->mutable_data<float>(platform::CPUPlace()), weight_data,
Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<size_t>(Y_t->numel())};
TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
static_cast<void*>(tmp->data<float>()),
static_cast<size_t>(Y_t->numel()));
weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
tmp_weight.dims = weight.dims;
// The data layout of TRT FC layer's weight is different from fluid's FC,
// need to reorder the elements.
ReorderCKtoKC(weight, &tmp_weight);
// Currently, the framework can only handle one fluid op -> one TRT layer,
// but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
// handle `mul`, leave `add` as another layer.
// DEBUG
weight.dims.assign({n, m});
float* bias_data = nullptr;
int bias_num = 0;
if (with_bias) {
@ -136,6 +125,10 @@ class FcOpConverter : public OpConverter {
static_cast<void*>(bias_data),
static_cast<size_t>(bias_num)};
if (engine_->with_dynamic_shape()) {
regist_fc(X, n_output, weight, bias);
return;
}
// In order to handle situations in NLP models (input dims < 3,
// x_num_col_dims != 1, etc.), reshape the input to perform FC correctly.
auto* reshape_itensor = X;
@ -192,20 +185,7 @@ class FcOpConverter : public OpConverter {
reshape_layer->setReshapeDimensions(reshape_dim);
reshape_itensor = reshape_layer->getOutput(0);
}
auto* fc_layer =
TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *reshape_itensor,
n_output, tmp_weight.get(), bias.get());
engine_->SetWeights(op_desc.Input(w_name).front(), std::move(tmp));
auto output_name = op_desc.Output("Out").front();
if (activation_type == "relu") {
nvinfer1::IActivationLayer* relu_layer =
TRT_ENGINE_ADD_LAYER(engine_, Activation, *(fc_layer->getOutput(0)),
nvinfer1::ActivationType::kRELU);
RreplenishLayerAndOutput(relu_layer, "fc", {output_name}, test_mode);
} else {
RreplenishLayerAndOutput(fc_layer, "fc", {output_name}, test_mode);
}
regist_fc(reshape_itensor, n_output, weight, bias);
}
};
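The tranpose_weight lambda replaces the removed ReorderCKtoKC helper: TRT's FullyConnected layer expects the weight transposed relative to fluid's (n_input x n_output) layout. A small standalone sketch of the same row-major transpose:

#include <cassert>
#include <vector>

// Transpose an m x n row-major matrix into an n x m row-major matrix.
void TransposeWeight(const float* src, float* dst, int m, int n) {
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      dst[j * m + i] = src[i * n + j];
    }
  }
}

int main() {
  // A 2 x 3 weight stored row-major (rows: input channels, columns: outputs).
  std::vector<float> w = {1, 2, 3,
                          4, 5, 6};
  std::vector<float> wt(6);
  TransposeWeight(w.data(), wt.data(), 2, 3);
  // wt is now 3 x 2, stored row-major as {1, 4, 2, 5, 3, 6}.
  assert(wt[1] == 4 && wt[2] == 2);
  return 0;
}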

@ -0,0 +1,83 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class SkipLayerNormOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
#if IS_TRT_VERSION_GE(6000)
VLOG(4) << "convert fused skip layernorm op to tensorrt layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
std::vector<nvinfer1::ITensor*> inputs;
inputs.push_back(input1);
inputs.push_back(input2);
auto get_persistable_data = [&](const std::string& arg_name,
framework::DDim* dims) -> float* {
std::string var_name = op_desc.Input(arg_name).front();
auto* temp_var = scope.FindVar(var_name);
auto* temp_tensor = temp_var->GetMutable<framework::LoDTensor>();
(*dims) = temp_tensor->dims();
auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor, false);
return temp_data;
};
framework::DDim bias_dims, scale_dims;
auto* bias = get_persistable_data("Bias", &bias_dims);
auto* scale = get_persistable_data("Scale", &scale_dims);
float eps = boost::get<float>(op_desc.GetAttr("epsilon"));
int bias_size = framework::product(bias_dims);
int scale_size = framework::product(scale_dims);
nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) {
bool ban_fp16 = engine_->disable_trt_plugin_fp16();
plugin::SkipLayerNormPluginDynamic* plugin =
new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size,
scale_size, eps, ban_fp16);
layer = engine_->AddPluginV2(inputs.data(), 2, plugin);
} else {
PADDLE_THROW(platform::errors::Fatal(
"You are running the Ernie(Bert) model in static"
"shape mode, which is not supported for the time being.\n"
"You can use the config.SetTRTDynamicShapeInfo(...) interface"
" to set the shape information to run the dynamic shape mode."));
}
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "skip_layernorm", {output_name}, test_mode);
#else
PADDLE_THROW(platform::errors::Fatal(
"You are running the TRT Dynamic Shape mode, need to confirm that "
"your TRT version is no less than 6.0"));
#endif
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(skip_layernorm, SkipLayerNormOpConverter);
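What the SkipLayerNorm plugin computes is LayerNorm(X + Y) with the given scale and bias over the hidden dimension (the actual kernels live in operators::math::SkipLayerNormFunctor). A minimal CPU reference sketch for a single hidden-sized row:

#include <cmath>
#include <vector>

// Reference for one [hidden]-sized row: out = scale * norm(x + y) + bias.
std::vector<float> SkipLayerNormRow(const std::vector<float>& x,
                                    const std::vector<float>& y,
                                    const std::vector<float>& scale,
                                    const std::vector<float>& bias,
                                    float eps) {
  const size_t h = x.size();
  std::vector<float> sum(h), out(h);
  float mean = 0.f, var = 0.f;
  for (size_t i = 0; i < h; ++i) {
    sum[i] = x[i] + y[i];  // the "skip" (residual) addition
    mean += sum[i];
  }
  mean /= h;
  for (size_t i = 0; i < h; ++i) var += (sum[i] - mean) * (sum[i] - mean);
  var /= h;
  const float inv_std = 1.f / std::sqrt(var + eps);
  for (size_t i = 0; i < h; ++i)
    out[i] = scale[i] * (sum[i] - mean) * inv_std + bias[i];
  return out;
}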

@ -160,6 +160,16 @@ void TensorRTEngine::FreezeNetwork() {
Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true));
}
infer_builder_config_->addOptimizationProfile(optim_profile_.get());
if (WithFp16()) {
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
if (disable_trt_plugin_fp16()) {
LOG(INFO) << "NOTE: In order to achieve higher accuracy, you have "
"disabled the fp16 mode of TRT Plugin,\n"
<< "you can reopen it with "
"'config.SetDynamicShapeInfo(min_shape, max_shape, "
"opt_shape, false /*disable_trt_plugin_fp16*/)'";
}
}
infer_engine_.reset(infer_builder_->buildEngineWithConfig(
*network(), *infer_builder_config_));
#endif
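WithFp16() is true only when the configured precision is kHalf and the platform has fast fp16, so disable_trt_plugin_fp16 only matters in that configuration. A minimal sketch of the combination the log message above refers to, i.e. an fp16 engine with the plugins pinned to fp32 (shape maps as in the earlier example):

#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

using ShapeMap = std::map<std::string, std::vector<int>>;

// Sketch: build the engine in fp16 but keep the new dynamic-shape plugins in
// fp32, e.g. when plugin fp16 costs too much accuracy.
void EnableHalfEngineWithFp32Plugins(paddle::AnalysisConfig* config,
                                     const ShapeMap& min_shape,
                                     const ShapeMap& max_shape,
                                     const ShapeMap& opt_shape) {
  config->EnableTensorRtEngine(1 << 30, 1, 5,
                               paddle::AnalysisConfig::Precision::kHalf,
                               false /*use_static*/, false /*use_calib_mode*/);
  config->SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape,
                                 true /*disable_trt_plugin_fp16*/);
}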

@ -124,6 +124,7 @@ class TensorRTEngine {
const ShapeMapType min_input_shape = {},
const ShapeMapType max_input_shape = {},
const ShapeMapType optim_input_shape = {},
bool disable_trt_plugin_fp16 = false,
nvinfer1::ILogger& logger = NaiveLogger::Global())
: max_batch_(max_batch),
max_workspace_(max_workspace),
@ -133,6 +134,7 @@ class TensorRTEngine {
min_input_shape_(min_input_shape),
max_input_shape_(max_input_shape),
optim_input_shape_(optim_input_shape),
disable_trt_plugin_fp16_(disable_trt_plugin_fp16),
logger_(logger) {
if (min_input_shape_.size() != 0 && max_input_shape_.size() != 0 &&
optim_input_shape_.size() != 0) {
@ -207,6 +209,13 @@ class TensorRTEngine {
void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch();
bool WithFp16() {
bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf);
bool support_fp16 = infer_builder_->platformHasFastFp16();
return enable_fp16 && support_fp16;
}
int GetDeviceId() { return device_id_; }
nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
int num_inputs, plugin::PluginTensorRT*);
@ -264,9 +273,18 @@ class TensorRTEngine {
ShapeMapType min_input_shape() { return min_input_shape_; }
ShapeMapType max_input_shape() { return max_input_shape_; }
ShapeMapType optim_input_shape() { return optim_input_shape_; }
bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; }
bool with_dynamic_shape() { return with_dynamic_shape_; }
#if IS_TRT_VERSION_GE(6000)
nvinfer1::IPluginV2Layer* AddPluginV2(nvinfer1::ITensor* const* inputs,
int num_inputs,
plugin::DynamicPluginTensorRT* plugin) {
owned_pluginv2_.emplace_back(plugin);
return network()->addPluginV2(inputs, num_inputs, *plugin);
}
#endif
private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
@ -289,6 +307,7 @@ class TensorRTEngine {
ShapeMapType min_input_shape_;
ShapeMapType max_input_shape_;
ShapeMapType optim_input_shape_;
bool disable_trt_plugin_fp16_{false};
nvinfer1::ILogger& logger_;
// max data size for the buffers.
@ -322,6 +341,7 @@ class TensorRTEngine {
#if IS_TRT_VERSION_GE(6000)
infer_ptr<nvinfer1::IBuilderConfig> infer_builder_config_;
std::unique_ptr<nvinfer1::IOptimizationProfile> optim_profile_;
std::vector<std::unique_ptr<plugin::DynamicPluginTensorRT>> owned_pluginv2_;
#endif
std::mutex mutex_;
}; // class TensorRTEngine
@ -358,10 +378,12 @@ class TRTEngineManager {
const std::map<std::string, std::vector<int>> min_input_shape = {},
const std::map<std::string, std::vector<int>> max_input_shape = {},
const std::map<std::string, std::vector<int>> optim_input_shape = {},
bool disable_trt_plugin_fp16 = false,
nvinfer1::ILogger& logger = NaiveLogger::Global()) {
auto* p = new TensorRTEngine(max_batch, max_workspace, precision,
calibrator, device_id, min_input_shape,
max_input_shape, optim_input_shape, logger);
auto* p =
new TensorRTEngine(max_batch, max_workspace, precision, calibrator,
device_id, min_input_shape, max_input_shape,
optim_input_shape, disable_trt_plugin_fp16, logger);
engines_[name].reset(p);
return p;
}

@ -23,6 +23,11 @@ struct SimpleOpTypeSetTeller : public Teller {
SimpleOpTypeSetTeller() {
#if IS_TRT_VERSION_GE(5130)
teller_set.insert("relu6");
#endif
#if IS_TRT_VERSION_GE(6000)
teller_set.insert("fused_embedding_eltwise_layernorm");
teller_set.insert("multihead_matmul");
teller_set.insert("skip_layernorm");
#endif
}
@ -38,9 +43,11 @@ struct SimpleOpTypeSetTeller : public Teller {
private:
// use this set for no calib int8.
std::unordered_set<std::string> int8_teller_set{
{"mul", "conv2d", "pool2d", "relu", "depthwise_conv2d", "softmax",
"batch_norm", "elementwise_add", "leaky_relu", "fc"}};
std::unordered_set<std::string> teller_set{{
"mul", "conv2d", "pool2d",
"relu", "depthwise_conv2d", "softmax",
"batch_norm", "elementwise_add", "leaky_relu",
"fc"};
std::unordered_set<std::string> teller_set{
"mul",
"conv2d",
"pool2d",
@ -65,8 +72,7 @@ struct SimpleOpTypeSetTeller : public Teller {
"instance_norm",
"gelu",
"layer_norm",
"multihead_matmul",
}};
};
};
bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc,

@ -1,5 +1,7 @@
nv_library(tensorrt_plugin
SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu
pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu instance_norm_op_plugin.cu
DEPS enforce tensorrt_engine prelu tensor)
prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu
pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu
instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu
qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu
DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)

@ -0,0 +1,180 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <cassert>
#include <cub/cub.cuh> // NOLINT
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)
int EmbEltwiseLayernormPluginDynamic::initialize() {
// Use resize (not reserve) so that indexing embs_gpu_[i] below is valid.
embs_gpu_.resize(embs_.size());
for (int i = 0; i < embs_.size(); i++) {
cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float),
cudaMemcpyHostToDevice);
}
cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
cudaMemcpyHostToDevice);
cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
cudaMemcpyHostToDevice);
return 0;
}
size_t EmbEltwiseLayernormPluginDynamic::getSerializationSize() const {
return 0;
}
void EmbEltwiseLayernormPluginDynamic::serialize(void *buffer) const {}
nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) {
PADDLE_ENFORCE_EQ(output_index, 0,
platform::errors::InvalidArgument(
"There is only one output of the EmbEltwiseLayernorm, "
"so the index should be zero,"
"but it's (%d)",
output_index));
PADDLE_ENFORCE_EQ(
nb_inputs, 3,
platform::errors::InvalidArgument(
"The Input of the EmbEltwiseLayernorm should be 3, but we found "
"it has (%d) inputs",
nb_inputs));
nvinfer1::DimsExprs ret;
ret.nbDims = 5;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[0].d[1];
ret.d[2] = expr_builder.constant(hidden_size_);
ret.d[3] = expr_builder.constant(1);
ret.d[4] = expr_builder.constant(1);
return ret;
}
bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL(
in_out, platform::errors::InvalidArgument(
"The input of swish plugin shoule not be nullptr."));
PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos, nb_inputs + nb_outputs));
const nvinfer1::PluginTensorDesc &desc = in_out[pos];
if (desc.format != nvinfer1::TensorFormat::kLINEAR) {
return false;
}
if (pos == 0) {
return desc.type == nvinfer1::DataType::kINT32;
}
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
if (pos == 1 || pos == 2) {
return desc.type == nvinfer1::DataType::kINT32 &&
desc.dims.d[0] == prev.dims.d[0] && desc.dims.d[1] == prev.dims.d[1];
}
if (pos == 3) {
return desc.type == nvinfer1::DataType::kFLOAT;
}
return false;
}
nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
PADDLE_ENFORCE_EQ(
index, 0, platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only has one input, so the "
"index value should be 0, but get %d.",
index));
return nvinfer1::DataType::kFLOAT;
}
int EmbEltwiseLayernormPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) {
auto id_dims = input_desc[0].dims;
int batch = id_dims.d[0];
int seq_len = id_dims.d[1];
int input_num = embs_.size();
framework::Tensor in_ptr_tensor, emb_ptr_tensor;
int device_id;
cudaGetDevice(&device_id);
in_ptr_tensor.Resize({input_num});
emb_ptr_tensor.Resize({input_num});
int64_t *in_ptr_gpu_d =
in_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
int64_t *emb_ptr_gpu_d =
emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
std::vector<int64_t> in_ptr, emb_ptr;
for (int i = 0; i < input_num; i++) {
in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
}
cudaMemcpyAsync(in_ptr_gpu_d, in_ptr.data(), sizeof(int64_t) * input_num,
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(emb_ptr_gpu_d, emb_ptr.data(), sizeof(int64_t) * input_num,
cudaMemcpyHostToDevice, stream);
auto out_type = output_desc[0].type;
const unsigned tpb = 256;
const dim3 grid(seq_len, batch, 1);
const dim3 block(tpb, 1, 1);
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kFLOAT, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only only support fp32 input."));
float *output_d = static_cast<float *>(outputs[0]);
operators::math::EmbEltwiseLayerNormFunctor<float> emb_eltwise_layernorm_func;
emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
eps_, input_num, stream);
return cudaGetLastError() != cudaSuccess;
}
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
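initialize() above allocates the embedding, bias, and scale buffers with cudaMalloc. The matching cleanup is not shown in this diff; a hedged sketch of what a terminate()-time release would typically look like, using the same member layout:

#include <cuda_runtime.h>
#include <vector>

// Hypothetical cleanup sketch (not part of this diff): release the buffers
// that initialize() above allocated with cudaMalloc.
void ReleaseEmbLayernormDeviceBuffers(std::vector<float*>* embs_gpu,
                                      float** bias_gpu, float** scale_gpu) {
  for (float*& p : *embs_gpu) {
    if (p != nullptr) {
      cudaFree(p);
      p = nullptr;
    }
  }
  if (*bias_gpu != nullptr) {
    cudaFree(*bias_gpu);
    *bias_gpu = nullptr;
  }
  if (*scale_gpu != nullptr) {
    cudaFree(*scale_gpu);
    *scale_gpu = nullptr;
  }
}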

@ -0,0 +1,113 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
public:
explicit EmbEltwiseLayernormPluginDynamic(std::vector<float*> input_embs,
float* bias, float* scale,
std::vector<int> emb_sizes,
int bias_size, int scale_size,
int hidden_size, float eps)
: embs_(input_embs),
bias_(bias),
scale_(scale),
emb_sizes_(emb_sizes),
bias_size_(bias_size),
scale_size_(scale_size),
hidden_size_(hidden_size),
eps_(eps) {}
EmbEltwiseLayernormPluginDynamic(void const* serialData,
size_t serialLength) {}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new EmbEltwiseLayernormPluginDynamic(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
eps_);
}
const char* getPluginType() const override {
return "fused_embedding_eltwise_layernorm_plugin";
}
int getNbOutputs() const override { return 1; }
int initialize() override;
size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
nvinfer1::DimsExprs getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
void destroy() override { delete this; }
private:
std::vector<float*> embs_;
float* bias_;
float* scale_;
// data on devices
float* bias_gpu_;
float* scale_gpu_;
std::vector<float*> embs_gpu_;
std::vector<int> emb_sizes_;
int bias_size_;
int scale_size_;
int hidden_size_;
float eps_;
};
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle

@ -0,0 +1,95 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
public:
explicit QkvToContextPluginDynamic(int hidden, int head_number, int head_size,
float scale, bool ban_fp16)
: hidden_(hidden),
head_number_(head_number),
head_size_(head_size),
scale_(scale),
ban_fp16_(ban_fp16) {}
QkvToContextPluginDynamic(void const* serialData, size_t serialLength) {}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new QkvToContextPluginDynamic(hidden_, head_number_, head_size_,
scale_, ban_fp16_);
}
const char* getPluginType() const override { return "qkv_to_context_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override;
size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
nvinfer1::DimsExprs getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
void destroy() override { delete this; }
private:
int hidden_;
int head_number_;
int head_size_;
float scale_;
bool ban_fp16_;
};
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
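The constructor parameters follow the usual multi-head attention relationship: hidden = head_number * head_size, and scale is normally 1/sqrt(head_size). A minimal sketch with assumed BERT/Ernie-base sized values (not taken from this diff):

#include <cmath>

// Hypothetical base-sized values; in a converter these would be passed to the
// QkvToContextPluginDynamic constructor declared above.
struct QkvParams {
  int hidden;
  int head_number;
  int head_size;
  float scale;
};

QkvParams MakeBaseSizeQkvParams() {
  QkvParams p;
  p.head_number = 12;
  p.head_size = 64;
  p.hidden = p.head_number * p.head_size;                       // 768
  p.scale = 1.0f / std::sqrt(static_cast<float>(p.head_size));  // ~0.125
  return p;
}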

@ -0,0 +1,150 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime.h>
#include <stdio.h>
#include <cassert>
#include <cub/cub.cuh> // NOLINT
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)
int SkipLayerNormPluginDynamic::initialize() {
cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
cudaMemcpyHostToDevice);
cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
cudaMemcpyHostToDevice);
return 0;
}
size_t SkipLayerNormPluginDynamic::getSerializationSize() const { return 0; }
void SkipLayerNormPluginDynamic::serialize(void *buffer) const {}
nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) {
PADDLE_ENFORCE_EQ(
inputs[0].nbDims, 5,
platform::errors::InvalidArgument(
"The Input dim of the SkipLayernorm should be 5, but it's (%d) now.",
inputs[0].nbDims));
return inputs[0];
}
bool SkipLayerNormPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL(
in_out, platform::errors::InvalidArgument(
"The input of swish plugin shoule not be nullptr."));
PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos, nb_inputs + nb_outputs));
const nvinfer1::PluginTensorDesc &in = in_out[pos];
if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16
if (ban_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
} else {
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
#else
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#endif
}
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
if (pos == 1) {
return in.type == prev.type && in.format == prev.format;
}
// output
return in.type == prev.type && in.format == prev.format;
}
nvinfer1::DataType SkipLayerNormPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
PADDLE_ENFORCE_EQ(index, 0,
platform::errors::InvalidArgument(
"The SkipLayerNorm Plugin only has one input, so the "
"index value should be 0, but get %d.",
index));
PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT ||
input_types[0] == nvinfer1::DataType::kHALF),
true, platform::errors::InvalidArgument(
"The input type should be half or float"));
return input_types[0];
}
int SkipLayerNormPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) {
auto input_dims = input_desc[0].dims;
size_t num = ProductDim(input_dims);
int hidden = input_dims.d[2];
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
const float *input1 = static_cast<const float *>(inputs[0]);
const float *input2 = static_cast<const float *>(inputs[1]);
float *output = static_cast<float *>(outputs[0]);
operators::math::SkipLayerNormFunctor<float> skip_layer_norm_func;
skip_layer_norm_func(num, hidden, input1, input2, scale_gpu_, bias_gpu_,
output, eps_, stream);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
const half *input1 = static_cast<const half *>(inputs[0]);
const half *input2 = static_cast<const half *>(inputs[1]);
half *output = static_cast<half *>(outputs[0]);
operators::math::SkipLayerNormFunctor<half> skip_layer_norm_func;
skip_layer_norm_func(num, hidden, input1, input2, scale_gpu_, bias_gpu_,
output, static_cast<half>(eps_), stream);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
} else {
PADDLE_THROW(platform::errors::Fatal(
"The SkipLayerNorm TRT Plugin's input type should be float or half."));
}
return cudaGetLastError() != cudaSuccess;
}
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle

@ -0,0 +1,102 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
public:
explicit SkipLayerNormPluginDynamic(float* bias, float* scale, int bias_size,
int scale_size, const float eps,
bool ban_fp16)
: bias_(bias),
scale_(scale),
bias_size_(bias_size),
scale_size_(scale_size),
eps_(eps),
ban_fp16_(ban_fp16) {}
SkipLayerNormPluginDynamic(void const* serialData, size_t serialLength) {}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new SkipLayerNormPluginDynamic(bias_, scale_, bias_size_,
scale_size_, eps_, ban_fp16_);
}
const char* getPluginType() const override { return "skip_layernorm_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override;
size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
nvinfer1::DimsExprs getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
void destroy() override { delete this; }
private:
float* bias_;
float* scale_;
float* bias_gpu_;
float* scale_gpu_;
int bias_size_;
int scale_size_;
float eps_;
bool ban_fp16_;
};
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle

@ -16,10 +16,12 @@
#include <NvInfer.h>
#include <cstring>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
@ -112,6 +114,72 @@ class PluginTensorRT : public nvinfer1::IPluginExt {
std::vector<nvinfer1::ITensor*> inputs_;
};
#if IS_TRT_VERSION_GE(6000)
class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
public:
DynamicPluginTensorRT() {}
DynamicPluginTensorRT(const void* serialized_data, size_t length) {}
// The functions in IPluginExt or IPluginV2Ext
virtual const char* getPluginVersion() const { return "1"; }
virtual const char* getPluginType() const = 0;
int getNbOutputs() const { return 1; }
int initialize() override { return 0; }
void terminate() override{};
virtual size_t getSerializationSize() const = 0;
virtual void serialize(void* buffer) const = 0;
// The functions in IPluginV2
nvinfer1::IPluginV2DynamicExt* clone() const = 0;
virtual nvinfer1::DimsExprs getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) = 0; // NOLINT
virtual bool supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs,
int nb_outputs) = 0;
virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nb_inputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nb_outputs) = 0;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nb_inputs,
const nvinfer1::PluginTensorDesc* outputs,
int nb_outputs) const override {
return 0;
}
virtual int enqueue(const nvinfer1::PluginTensorDesc* input_desc,
const nvinfer1::PluginTensorDesc* output_desc,
const void* const* inputs, void* const* outputs,
void* workspace, cudaStream_t stream) = 0;
virtual nvinfer1::DataType getOutputDataType(
int index, const nvinfer1::DataType* input_types,
int nb_inputs) const = 0;
void setPluginNamespace(const char* plugin_namespace) override {
name_space_ = plugin_namespace;
}
const char* getPluginNamespace() const override {
return name_space_.c_str();
}
virtual void destroy() = 0;
protected:
void deserializeBase(void const*& serial_data, // NOLINT
size_t& serial_length); // NOLINT
size_t getBaseSerializationSize() const;
void serializeBase(void*& buffer) const; // NOLINT
private:
std::string name_space_{"paddle_trt"};
std::string plugin_base_{"plugin_dynamic"};
};
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
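A hedged sketch (not part of this PR) of the minimum a new plugin has to provide on top of DynamicPluginTensorRT, modeled on the plugins added in this diff; it simply copies one fp32 input to the output, and in a converter it would be handed to TensorRTEngine::AddPluginV2:

#include <cuda_runtime.h>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

#if IS_TRT_VERSION_GE(6000)
// Illustrative identity plugin: one fp32 input, one fp32 output, same shape.
class IdentityPluginDynamic : public DynamicPluginTensorRT {
 public:
  IdentityPluginDynamic() {}
  IdentityPluginDynamic(void const* serial_data, size_t serial_length) {}
  nvinfer1::IPluginV2DynamicExt* clone() const override {
    return new IdentityPluginDynamic();
  }
  const char* getPluginType() const override { return "identity_plugin"; }
  size_t getSerializationSize() const override { return 0; }
  void serialize(void* buffer) const override {}
  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override {
    return inputs[0];  // the output has the same shape as the input
  }
  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* in_out,
                                 int nb_inputs, int nb_outputs) override {
    return in_out[pos].type == nvinfer1::DataType::kFLOAT &&
           in_out[pos].format == nvinfer1::TensorFormat::kLINEAR;
  }
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nb_inputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nb_outputs) override {}
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* input_types,
                                       int nb_inputs) const override {
    return input_types[0];
  }
  int enqueue(const nvinfer1::PluginTensorDesc* input_desc,
              const nvinfer1::PluginTensorDesc* output_desc,
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override {
    size_t count = 1;
    for (int i = 0; i < input_desc[0].dims.nbDims; ++i)
      count *= input_desc[0].dims.d[i];
    cudaMemcpyAsync(outputs[0], inputs[0], count * sizeof(float),
                    cudaMemcpyDeviceToDevice, stream);
    return cudaGetLastError() != cudaSuccess;
  }
  void destroy() override { delete this; }
};
#endif

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle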

@ -349,9 +349,6 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_analysis_test(trt_resnext_test SRCS trt_resnext_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
inference_analysis_test(trt_bert_test SRCS trt_bert_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${BERT_INSTALL_DIR}/model)
inference_analysis_test(trt_fc_prelu_test SRCS trt_fc_prelu_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
@ -367,6 +364,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant_small_model")
if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR})
inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "quant_small_model.tar.gz")
@ -382,6 +380,15 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_analysis_test(trt_dynamic_shape_test SRCS trt_dynamic_shape_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_DYNAMIC_MODEL})
set(TEST_TRT_ERNIE_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test")
if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL})
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4.tar.gz")
endif()
inference_analysis_test(test_trt_dynamic_shape_ernie SRCS trt_dynamic_shape_ernie_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4)
endif()
set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
