Fix registering trt plugin (#25744)
* develop dynamic shape serialization
* add test param for gelu
* fix bugs
* delete redundant comments
* debug
* fix conflict. test=develop
* fix bug. test=develop
* add trt dynamic shape serialized support
* fix ernie serialized bug. test=develop
* fix code style. test=develop
* fix bug. test=develop
* fix bug. test=develop
* modify cmakelist. test=develop
* fix bug. test=develop
* fix error message. test=develop
* fix trt register plugin based on pr#25003
* add trt dynload
* fix deserialization bug of not finding plugin registration
* refine code style
* recover engine key in tensorrt_subgraph_pass
* for ci coverage
* add unittest for deserialization

Co-authored-by: haozech <chenhaoze94@gmail.com>
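The central failure this PR addresses is an engine deserialization that cannot find its plugin creators. As a rough illustration (not code from this PR), TensorRT records only a plugin's (name, version) pair inside a serialized engine and re-creates the layer through the process-global plugin registry at load time, so every custom plugin must be registered before deserialization runs. The sketch below uses only the stock TensorRT API; the plugin name "EmbEltwiseLayernormPluginDyn" is a hypothetical placeholder, not a name taken from the Paddle sources.

#include <NvInfer.h>

#include <cassert>

// Minimal sketch of why deserialization fails without registration:
// getPluginRegistry() is TensorRT's process-global creator registry. If the
// lookup below returns nullptr, deserializing any engine that contains a
// layer produced by that plugin will fail.
bool PluginIsRegistered(const char* name, const char* version) {
  return getPluginRegistry()->getPluginCreator(name, version) != nullptr;
}

// "EmbEltwiseLayernormPluginDyn" is used purely for illustration; the real
// plugin names live in the Paddle TensorRT plugin sources.
void CheckBeforeDeserialize() {
  assert(PluginIsRegistered("EmbEltwiseLayernormPluginDyn", "1"));
}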
parent a697e94693
commit b717895f64

@@ -0,0 +1,146 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include <functional>
#include <map>
#include <numeric>
#include <string>
#include <vector>

#include "paddle/fluid/inference/tests/api/trt_test_helper.h"

namespace paddle {
namespace inference {

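// run() feeds a single ERNIE sample through the predictor and copies the
// first output back to the host. The four inputs bound below are, in order,
// token ids, sentence-type ids, position ids, and the float input mask
// (this naming follows the usual ERNIE input layout and is an assumption,
// not something stated in the PR).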
void run(const AnalysisConfig& config, std::vector<float>* out_data) {
  auto predictor = CreatePaddlePredictor(config);
  auto input_names = predictor->GetInputNames();

  int run_batch = 1;
  const int run_seq_len = 128;

  std::vector<int64_t> tmp_input;
  std::vector<float> tmp_four_input;
  tmp_input.reserve(run_batch * run_seq_len);
  tmp_four_input.reserve(run_batch * run_seq_len);

  int64_t i0[run_seq_len] = {
      1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321,
      4095, 1902, 4, 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2,
      75, 201, 340, 9, 14, 44, 486, 218, 1140, 279, 12043, 2};
  int64_t i1[run_seq_len] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  int64_t i2[run_seq_len] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
                             10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                             20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
                             30, 31, 32, 33, 34, 35, 36, 37, 38, 39};
  float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};

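  // Note: i0, i2, and i3 write only their first 40 elements explicitly;
  // aggregate initialization value-initializes the remainder of each
  // run_seq_len-sized array to zero.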
  // first input
  auto input_t = predictor->GetInputTensor(input_names[0]);
  input_t->Reshape({run_batch, run_seq_len, 1});
  input_t->copy_from_cpu(i0);

  // second input
  auto input_t2 = predictor->GetInputTensor(input_names[1]);
  input_t2->Reshape({run_batch, run_seq_len, 1});
  input_t2->copy_from_cpu(i1);

  // third input
  auto input_t3 = predictor->GetInputTensor(input_names[2]);
  input_t3->Reshape({run_batch, run_seq_len, 1});
  input_t3->copy_from_cpu(i2);

  // fourth input
  auto input_t4 = predictor->GetInputTensor(input_names[3]);
  input_t4->Reshape({run_batch, run_seq_len, 1});
  input_t4->copy_from_cpu(i3);

  ASSERT_TRUE(predictor->ZeroCopyRun());

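  // The graph (including the TRT subgraph) has run; copy the first output
  // tensor back to the host, sized by the product of its dimensions.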
  auto output_names = predictor->GetOutputNames();
  auto output_t = predictor->GetOutputTensor(output_names[0]);
  std::vector<int> output_shape = output_t->shape();
  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
                                std::multiplies<int>());
  out_data->resize(out_num);
  output_t->copy_to_cpu(out_data->data());
}

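// trt_ernie() builds an AnalysisConfig with a dynamic-shape TensorRT engine
// and checks the predictor's output against the expected values. Passing
// use_static = true to EnableTensorRtEngine serializes the optimized engine
// to disk, so a later run over the same model directory exercises the
// deserialization path this PR fixes (this reading of the flag is an
// assumption based on the AnalysisConfig API of this period).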
void trt_ernie(bool with_fp16, std::vector<float> result) {
  AnalysisConfig config;
  std::string model_dir = FLAGS_infer_model;
  SetConfig(&config, model_dir, true /* use_gpu */);

  config.SwitchUseFeedFetchOps(false);

  int head_number = 12;
  int batch = 1;
  int min_seq_len = 1;
  int max_seq_len = 128;
  int opt_seq_len = 128;

  std::vector<int> min_shape = {batch, min_seq_len, 1};
  std::vector<int> max_shape = {batch, max_seq_len, 1};
  std::vector<int> opt_shape = {batch, opt_seq_len, 1};
  // Set the input's min, max, opt shape
  std::map<std::string, std::vector<int>> min_input_shape = {
      {"read_file_0.tmp_0", min_shape},
      {"read_file_0.tmp_1", min_shape},
      {"read_file_0.tmp_2", min_shape},
      {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}};
  std::map<std::string, std::vector<int>> max_input_shape = {
      {"read_file_0.tmp_0", max_shape},
      {"read_file_0.tmp_1", max_shape},
      {"read_file_0.tmp_2", max_shape},
      {"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}};
  std::map<std::string, std::vector<int>> opt_input_shape = {
      {"read_file_0.tmp_0", opt_shape},
      {"read_file_0.tmp_1", opt_shape},
      {"read_file_0.tmp_2", opt_shape},
      {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}};

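  // EnableTensorRtEngine(workspace_size, max_batch, min_subgraph_size,
  // precision, use_static, use_calib_mode): 1 GB of workspace, batch size 1,
  // subgraphs of at least 5 ops offloaded to TRT, engine serialization on,
  // INT8 calibration off. (Parameter names recalled from the AnalysisConfig
  // header of this era; treat them as an assumption.)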
  auto precision = AnalysisConfig::Precision::kFloat32;
  if (with_fp16) {
    precision = AnalysisConfig::Precision::kHalf;
  }
  config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
  config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
                                opt_input_shape);
  std::vector<float> out_data;
  run(config, &out_data);
  for (size_t i = 0; i < out_data.size(); i++) {
    EXPECT_NEAR(result[i], out_data[i], 1e-6);
  }
}

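// The expected values below are reference outputs, presumably recorded from
// earlier runs of the ERNIE model in each precision; the fp16 test body only
// compiles when half-precision support is available.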
TEST(AnalysisPredictor, no_fp16) {
  std::vector<float> result = {0.597841, 0.219972, 0.182187};
  trt_ernie(false, result);
}

TEST(AnalysisPredictor, fp16) {
#ifdef SUPPORTS_CUDA_FP16
  std::vector<float> result = {0.598336, 0.219558, 0.182106};
  trt_ernie(true, result);
#endif
}

}  // namespace inference
}  // namespace paddle