[Paddle-TRT]: Ernie Dynamic shape support. (#23138)
* add dynamic plugin support. test=develop
* change emb eltwise layernorm to math function test=develop
* add emb eltwise layernorm test=develop
* can run dynamic shape ernie test=develop
* fix ci test=develop
* add ut for trt ernie dynamic test=develop
* refine dynamic shape c++ interface. test=develop
* fix comments test=develop
* fix comments test=develop

Branch: revert-23830-2.0-beta
parent d0413e58d3
commit 430b0099c9
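Both converters below reject static shape mode and direct users to config.SetTRTDynamicShapeInfo(...). The following is a minimal sketch of how an application would enable TRT dynamic shape mode for an Ernie-style model; the tensor names and shape ranges are illustrative assumptions, not part of this diff, and the AnalysisConfig overloads shown are the ones this Paddle generation is believed to expose.

// Hedged usage sketch: enabling TRT dynamic shape for an Ernie-style model.
// Feed names ("eval_placeholder_*") and the min/max/opt ranges are
// hypothetical placeholders; use your model's actual input names.
#include <map>
#include <string>
#include <vector>
#include "paddle/include/paddle_inference_api.h"

void ConfigureErnieTrtDynamicShape(paddle::AnalysisConfig* config) {
  config->EnableUseGpu(100 /* initial GPU memory (MB) */, 0 /* device id */);
  config->EnableTensorRtEngine(1 << 30 /* workspace */, 1 /* max batch */,
                               5 /* min subgraph size */,
                               paddle::AnalysisConfig::Precision::kFloat32,
                               false /* use_static */,
                               false /* use_calib_mode */);
  // Each id input is [batch, seq_len]; TRT needs its min/max/optimal range.
  std::map<std::string, std::vector<int>> min_shape, max_shape, opt_shape;
  for (std::string name : {"eval_placeholder_0", "eval_placeholder_1",
                           "eval_placeholder_2"}) {
    min_shape[name] = {1, 1};
    max_shape[name] = {16, 128};
    opt_shape[name] = {8, 64};
  }
  config->SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);
}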
@@ -0,0 +1,115 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {

class EmbEltwiseLayerNormOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
#if IS_TRT_VERSION_GE(6000)
    VLOG(4) << "convert fused_embedding_eltwise_layernorm op to tensorrt layer";

    framework::OpDesc op_desc(op, nullptr);
    auto id_names = op_desc.Input("Ids");
    auto emb_names = op_desc.Input("Embs");

    PADDLE_ENFORCE_EQ(id_names.size(), emb_names.size(),
                      platform::errors::InvalidArgument(
                          "The Ids and Embs input sizes of the fused "
                          "EmbEltwiseLayerNormOp should be the same."));
    int input_num = id_names.size();

    // Declare inputs
    std::vector<nvinfer1::ITensor*> input_ids;
    for (int i = 0; i < input_num; i++) {
      input_ids.push_back(engine_->GetITensor(id_names[i]));
    }

    std::vector<float*> input_embs;
    std::vector<int> emb_sizes;

    // Get the persistable var's data.
    auto get_persistable_data = [&](const std::string& var_name,
                                    framework::DDim* dims) -> float* {
      auto* temp_var = scope.FindVar(var_name);
      auto* temp_tensor = temp_var->GetMutable<framework::LoDTensor>();
      (*dims) = temp_tensor->dims();

      auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor, false);
      return temp_data;
    };

    int hidden = 0;
    for (int i = 0; i < input_num; i++) {
      framework::DDim emb_dims;
      float* emb_data = get_persistable_data(emb_names[i], &emb_dims);
      int64_t emb_size = framework::product(emb_dims);
      input_embs.push_back(emb_data);
      emb_sizes.push_back(emb_size);
      PADDLE_ENFORCE_EQ(
          emb_dims.size(), 2,
          platform::errors::InvalidArgument(
              "The fused EmbEltwiseLayerNorm's emb should be 2 dims."));
      hidden = emb_dims[1];
    }

    framework::DDim bias_dims, scale_dims;

    auto* bias =
        get_persistable_data(op_desc.Input("Bias").front(), &bias_dims);
    auto* scale =
        get_persistable_data(op_desc.Input("Scale").front(), &scale_dims);
    int64_t bias_size = framework::product(bias_dims);
    int64_t scale_size = framework::product(scale_dims);
    float eps = boost::get<float>(op_desc.GetAttr("epsilon"));
    nvinfer1::ILayer* layer = nullptr;

    if (engine_->with_dynamic_shape()) {
      plugin::EmbEltwiseLayernormPluginDynamic* plugin =
          new plugin::EmbEltwiseLayernormPluginDynamic(input_embs, bias, scale,
                                                       emb_sizes, bias_size,
                                                       scale_size, hidden, eps);
      layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
    } else {
      PADDLE_THROW(platform::errors::Fatal(
          "You are running the Ernie(Bert) model in static "
          "shape mode, which is not supported for the time being.\n"
          "You can use the config.SetTRTDynamicShapeInfo(...) interface"
          " to set the shape information to run the dynamic shape mode."));
    }

    auto output_name = op_desc.Output("Out")[0];
    RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name},
                             test_mode);
#else
    PADDLE_THROW(platform::errors::Fatal(
        "You are running the TRT Dynamic Shape mode; you need to confirm "
        "that your TRT version is no less than 6.0."));
#endif
  }
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

REGISTER_TRT_OP_CONVERTER(fused_embedding_eltwise_layernorm,
                          EmbEltwiseLayerNormOpConverter);
(File diff suppressed because it is too large.)
@@ -0,0 +1,83 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {

class SkipLayerNormOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
#if IS_TRT_VERSION_GE(6000)
    VLOG(4) << "convert fused skip_layernorm op to tensorrt layer";
    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
    auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
    std::vector<nvinfer1::ITensor*> inputs;
    inputs.push_back(input1);
    inputs.push_back(input2);

    auto get_persistable_data = [&](const std::string& arg_name,
                                    framework::DDim* dims) -> float* {
      std::string var_name = op_desc.Input(arg_name).front();
      auto* temp_var = scope.FindVar(var_name);
      auto* temp_tensor = temp_var->GetMutable<framework::LoDTensor>();
      (*dims) = temp_tensor->dims();

      auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor, false);
      return temp_data;
    };

    framework::DDim bias_dims, scale_dims;
    auto* bias = get_persistable_data("Bias", &bias_dims);
    auto* scale = get_persistable_data("Scale", &scale_dims);
    float eps = boost::get<float>(op_desc.GetAttr("epsilon"));
    int bias_size = framework::product(bias_dims);
    int scale_size = framework::product(scale_dims);

    nvinfer1::ILayer* layer = nullptr;
    if (engine_->with_dynamic_shape()) {
      bool ban_fp16 = engine_->disable_trt_plugin_fp16();
      plugin::SkipLayerNormPluginDynamic* plugin =
          new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size,
                                                 scale_size, eps, ban_fp16);
      layer = engine_->AddPluginV2(inputs.data(), 2, plugin);
    } else {
      PADDLE_THROW(platform::errors::Fatal(
          "You are running the Ernie(Bert) model in static "
          "shape mode, which is not supported for the time being.\n"
          "You can use the config.SetTRTDynamicShapeInfo(...) interface"
          " to set the shape information to run the dynamic shape mode."));
    }

    auto output_name = op_desc.Output("Out")[0];
    RreplenishLayerAndOutput(layer, "skip_layernorm", {output_name}, test_mode);
#else
    PADDLE_THROW(platform::errors::Fatal(
        "You are running the TRT Dynamic Shape mode; you need to confirm "
        "that your TRT version is no less than 6.0."));
#endif
  }
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

REGISTER_TRT_OP_CONVERTER(skip_layernorm, SkipLayerNormOpConverter);
@@ -1,5 +1,7 @@
 nv_library(tensorrt_plugin
   SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
   prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu
-  pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu instance_norm_op_plugin.cu
-  DEPS enforce tensorrt_engine prelu tensor)
+  pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu
+  instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu
+  qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu
+  DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
@@ -0,0 +1,180 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <cassert>
#include <cub/cub.cuh>  // NOLINT
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)

int EmbEltwiseLayernormPluginDynamic::initialize() {
  // resize (not reserve): indexing embs_gpu_[i] below requires the elements
  // to actually exist.
  embs_gpu_.resize(embs_.size());
  for (size_t i = 0; i < embs_.size(); i++) {
    cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
    cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float),
               cudaMemcpyHostToDevice);
  }

  cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
  cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
  cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
             cudaMemcpyHostToDevice);

  return 0;
}

size_t EmbEltwiseLayernormPluginDynamic::getSerializationSize() const {
  return 0;
}

void EmbEltwiseLayernormPluginDynamic::serialize(void *buffer) const {}

nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
    nvinfer1::IExprBuilder &expr_builder) {
  PADDLE_ENFORCE_EQ(output_index, 0,
                    platform::errors::InvalidArgument(
                        "There is only one output of the EmbEltwiseLayernorm, "
                        "so the index should be zero, "
                        "but it's (%d)",
                        output_index));
  PADDLE_ENFORCE_EQ(
      nb_inputs, 3,
      platform::errors::InvalidArgument(
          "The input number of the EmbEltwiseLayernorm should be 3, but we "
          "found it has (%d) inputs",
          nb_inputs));
  // The output shape is [batch, seq_len, hidden_size, 1, 1].
  nvinfer1::DimsExprs ret;
  ret.nbDims = 5;
  ret.d[0] = inputs[0].d[0];
  ret.d[1] = inputs[0].d[1];
  ret.d[2] = expr_builder.constant(hidden_size_);
  ret.d[3] = expr_builder.constant(1);
  ret.d[4] = expr_builder.constant(1);
  return ret;
}

bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
    int nb_outputs) {
  PADDLE_ENFORCE_NOT_NULL(
      in_out, platform::errors::InvalidArgument(
                  "The input of the EmbEltwiseLayernorm plugin should not be "
                  "nullptr."));

  PADDLE_ENFORCE_LT(
      pos, nb_inputs + nb_outputs,
      platform::errors::InvalidArgument("The pos(%d) should be less than the "
                                        "num(%d) of the input and the output.",
                                        pos, nb_inputs + nb_outputs));

  const nvinfer1::PluginTensorDesc &desc = in_out[pos];
  if (desc.format != nvinfer1::TensorFormat::kLINEAR) {
    return false;
  }

  // The three id inputs are int32; the later id inputs must match the first
  // input's [batch, seq_len] dims.
  if (pos == 0) {
    return desc.type == nvinfer1::DataType::kINT32;
  }

  const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
  if (pos == 1 || pos == 2) {
    return desc.type == nvinfer1::DataType::kINT32 &&
           desc.dims.d[0] == prev.dims.d[0] && desc.dims.d[1] == prev.dims.d[1];
  }

  // pos == 3 is the fp32 output.
  if (pos == 3) {
    return desc.type == nvinfer1::DataType::kFLOAT;
  }
  return false;
}

nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
  PADDLE_ENFORCE_EQ(
      index, 0, platform::errors::InvalidArgument(
                    "The EmbEltwiseLayernorm Plugin only has one output, so "
                    "the index value should be 0, but get %d.",
                    index));
  return nvinfer1::DataType::kFLOAT;
}

int EmbEltwiseLayernormPluginDynamic::enqueue(
    const nvinfer1::PluginTensorDesc *input_desc,
    const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
    void *const *outputs, void *workspace, cudaStream_t stream) {
  auto id_dims = input_desc[0].dims;
  int batch = id_dims.d[0];
  int seq_len = id_dims.d[1];
  int input_num = embs_.size();

  framework::Tensor in_ptr_tensor, emb_ptr_tensor;
  int device_id;
  cudaGetDevice(&device_id);

  // Pack the device pointers of the id inputs and the embedding tables into
  // two GPU arrays so the kernel can index them per input.
  in_ptr_tensor.Resize({input_num});
  emb_ptr_tensor.Resize({input_num});
  int64_t *in_ptr_gpu_d =
      in_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
  int64_t *emb_ptr_gpu_d =
      emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));

  std::vector<int64_t> in_ptr, emb_ptr;
  for (int i = 0; i < input_num; i++) {
    in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
    emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
  }

  cudaMemcpyAsync(in_ptr_gpu_d, in_ptr.data(), sizeof(int64_t) * input_num,
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(emb_ptr_gpu_d, emb_ptr.data(), sizeof(int64_t) * input_num,
                  cudaMemcpyHostToDevice, stream);

  auto out_type = output_desc[0].type;
  PADDLE_ENFORCE_EQ(
      out_type == nvinfer1::DataType::kFLOAT, true,
      platform::errors::InvalidArgument(
          "The EmbEltwiseLayernorm Plugin only supports fp32 input."));

  float *output_d = static_cast<float *>(outputs[0]);
  operators::math::EmbEltwiseLayerNormFunctor<float> emb_eltwise_layernorm_func;
  emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
                             scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
                             eps_, input_num, stream);
  return cudaGetLastError() != cudaSuccess;
}
#endif

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
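For orientation, the fused kernel's semantics: each of the input_num int32 id tensors indexes its own embedding table, the looked-up rows are summed elementwise, and layer normalization with scale/bias is applied over the hidden dimension. Below is a plain CPU reference of that computation, under the assumption that this is what EmbEltwiseLayerNormFunctor implements on the GPU; it is a sketch, not the actual Paddle kernel.

// Hedged CPU reference for the fused embedding + eltwise-add + layernorm.
#include <cmath>
#include <cstdint>
#include <vector>

void EmbEltwiseLayerNormRef(
    int batch, int seq_len, int hidden, int input_num,
    const std::vector<const int32_t*>& ids,  // each [batch * seq_len]
    const std::vector<const float*>& embs,   // each [vocab_i, hidden]
    const float* scale, const float* bias,   // each [hidden]
    float eps, float* out) {                 // [batch * seq_len, hidden]
  for (int t = 0; t < batch * seq_len; ++t) {
    float* row = out + t * hidden;
    // Sum the embedding rows selected by each id input.
    for (int h = 0; h < hidden; ++h) row[h] = 0.f;
    for (int k = 0; k < input_num; ++k) {
      const float* table_row = embs[k] + ids[k][t] * hidden;
      for (int h = 0; h < hidden; ++h) row[h] += table_row[h];
    }
    // LayerNorm over the hidden dimension, then affine scale/bias.
    float mean = 0.f, var = 0.f;
    for (int h = 0; h < hidden; ++h) mean += row[h];
    mean /= hidden;
    for (int h = 0; h < hidden; ++h) var += (row[h] - mean) * (row[h] - mean);
    var /= hidden;
    float rstd = 1.f / std::sqrt(var + eps);
    for (int h = 0; h < hidden; ++h)
      row[h] = (row[h] - mean) * rstd * scale[h] + bias[h];
  }
}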
@@ -0,0 +1,113 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <string>
#include <vector>

#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

#if IS_TRT_VERSION_GE(6000)
class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
 public:
  explicit EmbEltwiseLayernormPluginDynamic(std::vector<float*> input_embs,
                                            float* bias, float* scale,
                                            std::vector<int> emb_sizes,
                                            int bias_size, int scale_size,
                                            int hidden_size, float eps)
      : embs_(input_embs),
        bias_(bias),
        scale_(scale),
        emb_sizes_(emb_sizes),
        bias_size_(bias_size),
        scale_size_(scale_size),
        hidden_size_(hidden_size),
        eps_(eps) {}

  EmbEltwiseLayernormPluginDynamic(void const* serialData,
                                   size_t serialLength) {}
  nvinfer1::IPluginV2DynamicExt* clone() const override {
    return new EmbEltwiseLayernormPluginDynamic(
        embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
        eps_);
  }

  const char* getPluginType() const override {
    return "fused_embedding_eltwise_layernorm_plugin";
  }
  int getNbOutputs() const override { return 1; }
  int initialize() override;

  size_t getSerializationSize() const override;
  void serialize(void* buffer) const override;

  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override;

  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* inOut,
                                 int nbInputs, int nbOutputs) override;

  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nbOutputs) override {}

  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                          int nbInputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nbOutputs) const override {
    return 0;
  }

  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
              const nvinfer1::PluginTensorDesc* outputDesc,
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override;
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* inputTypes,
                                       int nbInputs) const override;

  void destroy() override { delete this; }

 private:
  std::vector<float*> embs_;
  float* bias_;
  float* scale_;

  // data on devices
  float* bias_gpu_;
  float* scale_gpu_;
  std::vector<float*> embs_gpu_;

  std::vector<int> emb_sizes_;
  int bias_size_;
  int scale_size_;
  int hidden_size_;
  float eps_;
};
#endif
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
(File diff suppressed because it is too large.)
@@ -0,0 +1,95 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <string>
#include <vector>

#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

#if IS_TRT_VERSION_GE(6000)
class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
 public:
  explicit QkvToContextPluginDynamic(int hidden, int head_number, int head_size,
                                     float scale, bool ban_fp16)
      : hidden_(hidden),
        head_number_(head_number),
        head_size_(head_size),
        scale_(scale),
        ban_fp16_(ban_fp16) {}

  QkvToContextPluginDynamic(void const* serialData, size_t serialLength) {}
  nvinfer1::IPluginV2DynamicExt* clone() const override {
    return new QkvToContextPluginDynamic(hidden_, head_number_, head_size_,
                                         scale_, ban_fp16_);
  }

  const char* getPluginType() const override { return "qkv_to_context_plugin"; }
  int getNbOutputs() const override { return 1; }
  int initialize() override;

  size_t getSerializationSize() const override;
  void serialize(void* buffer) const override;

  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override;

  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* inOut,
                                 int nbInputs, int nbOutputs) override;

  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nbOutputs) override {}

  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                          int nbInputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nbOutputs) const override {
    return 0;
  }

  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
              const nvinfer1::PluginTensorDesc* outputDesc,
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override;
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* inputTypes,
                                       int nbInputs) const override;

  void destroy() override { delete this; }

 private:
  int hidden_;
  int head_number_;
  int head_size_;
  float scale_;
  bool ban_fp16_;
};
#endif

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
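Only this plugin's interface is visible in this view; its kernel implementation is in one of the suppressed diffs. From the parameters (hidden, head_number, head_size, scale), the plugin presumably turns a fused QKV tensor into the attention context, i.e. softmax(Q K^T * scale) V per head. The following is a hedged single-head CPU reference of that core computation; the real kernel's batched data layout and fusion are not shown in this diff.

// Hedged reference: scaled dot-product attention for one head, assuming the
// plugin computes context = softmax(Q K^T * scale) V. `scale` is typically
// 1 / sqrt(head_size).
#include <algorithm>
#include <cmath>
#include <vector>

void AttentionHeadRef(int seq_len, int head_size, float scale,
                      const float* Q, const float* K,  // [seq_len, head_size]
                      const float* V,                  // [seq_len, head_size]
                      float* context) {                // [seq_len, head_size]
  std::vector<float> probs(seq_len);
  for (int i = 0; i < seq_len; ++i) {
    // Scaled scores of query i against every key.
    float max_s = -1e30f;
    for (int j = 0; j < seq_len; ++j) {
      float s = 0.f;
      for (int d = 0; d < head_size; ++d)
        s += Q[i * head_size + d] * K[j * head_size + d];
      probs[j] = s * scale;
      max_s = std::max(max_s, probs[j]);
    }
    // Numerically stable softmax over the scores.
    float sum = 0.f;
    for (int j = 0; j < seq_len; ++j)
      sum += (probs[j] = std::exp(probs[j] - max_s));
    for (int j = 0; j < seq_len; ++j) probs[j] /= sum;
    // Probability-weighted sum of the values.
    for (int d = 0; d < head_size; ++d) {
      float acc = 0.f;
      for (int j = 0; j < seq_len; ++j)
        acc += probs[j] * V[j * head_size + d];
      context[i * head_size + d] = acc;
    }
  }
}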
@@ -0,0 +1,150 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cuda_runtime.h>
#include <stdio.h>
#include <cassert>
#include <cub/cub.cuh>  // NOLINT
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)

int SkipLayerNormPluginDynamic::initialize() {
  cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
  cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
  cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
             cudaMemcpyHostToDevice);
  return 0;
}

size_t SkipLayerNormPluginDynamic::getSerializationSize() const { return 0; }

void SkipLayerNormPluginDynamic::serialize(void *buffer) const {}

nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
    nvinfer1::IExprBuilder &expr_builder) {
  PADDLE_ENFORCE_EQ(
      inputs[0].nbDims, 5,
      platform::errors::InvalidArgument(
          "The input dim of the SkipLayernorm should be 5, but it's (%d) now.",
          inputs[0].nbDims));
  // The output shape is identical to the first input's.
  return inputs[0];
}

bool SkipLayerNormPluginDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
    int nb_outputs) {
  PADDLE_ENFORCE_NOT_NULL(
      in_out, platform::errors::InvalidArgument(
                  "The input of the SkipLayerNorm plugin should not be "
                  "nullptr."));

  PADDLE_ENFORCE_LT(
      pos, nb_inputs + nb_outputs,
      platform::errors::InvalidArgument("The pos(%d) should be less than the "
                                        "num(%d) of the input and the output.",
                                        pos, nb_inputs + nb_outputs));

  const nvinfer1::PluginTensorDesc &in = in_out[pos];
  if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16
    if (ban_fp16_) {
      return (in.type == nvinfer1::DataType::kFLOAT) &&
             (in.format == nvinfer1::TensorFormat::kLINEAR);
    } else {
      return (in.type == nvinfer1::DataType::kFLOAT ||
              in.type == nvinfer1::DataType::kHALF) &&
             (in.format == nvinfer1::TensorFormat::kLINEAR);
    }
#else
    return (in.type == nvinfer1::DataType::kFLOAT) &&
           (in.format == nvinfer1::TensorFormat::kLINEAR);
#endif
  }
  const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];

  // The second input (pos == 1) and the output must match the preceding
  // tensor's type and format.
  if (pos == 1) {
    return in.type == prev.type && in.format == prev.format;
  }

  // output
  return in.type == prev.type && in.format == prev.format;
}

nvinfer1::DataType SkipLayerNormPluginDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
  PADDLE_ENFORCE_EQ(index, 0,
                    platform::errors::InvalidArgument(
                        "The SkipLayerNorm Plugin only has one output, so the "
                        "index value should be 0, but get %d.",
                        index));
  PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT ||
                     input_types[0] == nvinfer1::DataType::kHALF),
                    true, platform::errors::InvalidArgument(
                              "The input type should be half or float."));
  return input_types[0];
}

int SkipLayerNormPluginDynamic::enqueue(
    const nvinfer1::PluginTensorDesc *input_desc,
    const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
    void *const *outputs, void *workspace, cudaStream_t stream) {
  auto input_dims = input_desc[0].dims;
  size_t num = ProductDim(input_dims);
  int hidden = input_dims.d[2];

  auto input_type = input_desc[0].type;
  if (input_type == nvinfer1::DataType::kFLOAT) {
    const float *input1 = static_cast<const float *>(inputs[0]);
    const float *input2 = static_cast<const float *>(inputs[1]);
    float *output = static_cast<float *>(outputs[0]);
    operators::math::SkipLayerNormFunctor<float> skip_layer_norm_func;
    skip_layer_norm_func(num, hidden, input1, input2, scale_gpu_, bias_gpu_,
                         output, eps_, stream);
  } else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
    const half *input1 = static_cast<const half *>(inputs[0]);
    const half *input2 = static_cast<const half *>(inputs[1]);
    half *output = static_cast<half *>(outputs[0]);
    operators::math::SkipLayerNormFunctor<half> skip_layer_norm_func;
    skip_layer_norm_func(num, hidden, input1, input2, scale_gpu_, bias_gpu_,
                         output, static_cast<half>(eps_), stream);
#else
    PADDLE_THROW(platform::errors::Fatal(
        "The CUDA arch you specify should be greater than 600."));
#endif
  } else {
    PADDLE_THROW(platform::errors::Fatal(
        "The SkipLayerNorm TRT Plugin's input type should be float or half."));
  }
  return cudaGetLastError() != cudaSuccess;
}
#endif

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
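For reference, the skip-layernorm fusion adds its two input tensors (the residual "skip" connection) and layer-normalizes the sum over the hidden dimension. A hedged CPU sketch of what SkipLayerNormFunctor is assumed to compute follows; the treatment of `num` as the total element count matches its use in enqueue() above, but the actual Paddle kernel is not reproduced here.

// Hedged CPU reference for skip + layernorm: out = LayerNorm(x + y) with
// affine scale/bias, normalized over the last (hidden) dimension.
#include <cmath>

void SkipLayerNormRef(int num, int hidden, const float* x, const float* y,
                      const float* scale, const float* bias, float eps,
                      float* out) {
  int rows = num / hidden;  // `num` is the total element count.
  for (int r = 0; r < rows; ++r) {
    const float* xr = x + r * hidden;
    const float* yr = y + r * hidden;
    float* o = out + r * hidden;
    float mean = 0.f, var = 0.f;
    for (int h = 0; h < hidden; ++h) {
      o[h] = xr[h] + yr[h];  // residual add
      mean += o[h];
    }
    mean /= hidden;
    for (int h = 0; h < hidden; ++h) var += (o[h] - mean) * (o[h] - mean);
    var /= hidden;
    float rstd = 1.f / std::sqrt(var + eps);
    for (int h = 0; h < hidden; ++h)
      o[h] = (o[h] - mean) * rstd * scale[h] + bias[h];
  }
}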
@@ -0,0 +1,102 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <string>
#include <vector>

#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

#if IS_TRT_VERSION_GE(6000)
class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
 public:
  explicit SkipLayerNormPluginDynamic(float* bias, float* scale, int bias_size,
                                      int scale_size, const float eps,
                                      bool ban_fp16)
      : bias_(bias),
        scale_(scale),
        bias_size_(bias_size),
        scale_size_(scale_size),
        eps_(eps),
        ban_fp16_(ban_fp16) {}
  SkipLayerNormPluginDynamic(void const* serialData, size_t serialLength) {}
  nvinfer1::IPluginV2DynamicExt* clone() const override {
    return new SkipLayerNormPluginDynamic(bias_, scale_, bias_size_,
                                          scale_size_, eps_, ban_fp16_);
  }

  const char* getPluginType() const override { return "skip_layernorm_plugin"; }
  int getNbOutputs() const override { return 1; }
  int initialize() override;

  size_t getSerializationSize() const override;
  void serialize(void* buffer) const override;

  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override;

  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* inOut,
                                 int nbInputs, int nbOutputs) override;

  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nbOutputs) override {}

  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                          int nbInputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nbOutputs) const override {
    return 0;
  }

  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
              const nvinfer1::PluginTensorDesc* outputDesc,
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override;
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* inputTypes,
                                       int nbInputs) const override;

  void destroy() override { delete this; }

 private:
  float* bias_;
  float* scale_;

  float* bias_gpu_;
  float* scale_gpu_;

  int bias_size_;
  int scale_size_;

  float eps_;
  bool ban_fp16_;
};
#endif

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
(Some files were not shown because too many files have changed in this diff.)