parent 3dbd4087fe
commit 50bee83f71

@@ -0,0 +1,75 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {

class InstanceNormOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    VLOG(4) << "convert fluid instance_norm op to tensorrt instance_norm "
               "plugin layer";

    framework::OpDesc op_desc(op, nullptr);
    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);

    float eps = boost::get<float>(op_desc.GetAttr("epsilon"));

    auto* scale_var = scope.FindVar(op_desc.Input("Scale")[0]);
    auto* bias_var = scope.FindVar(op_desc.Input("Bias")[0]);
    PADDLE_ENFORCE_NOT_NULL(
        scale_var,
        platform::errors::InvalidArgument(
            "Input [Scale] of instance_norm op converter should not be null"));
    PADDLE_ENFORCE_NOT_NULL(
        bias_var,
        platform::errors::InvalidArgument(
            "Input [Bias] of instance_norm op converter should not be null"));
    auto* scale_tensor = scale_var->GetMutable<framework::LoDTensor>();
    auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
    PADDLE_ENFORCE_EQ(
        scale_tensor->numel(), bias_tensor->numel(),
        platform::errors::InvalidArgument(
            "Num of input [Scale] and [Bias] of instance_norm op converter "
            "should be equal. Got Scale num = %ld, but Bias num = %ld",
            scale_tensor->numel(), bias_tensor->numel()));
    auto* scale_d = scale_tensor->data<float>();
    auto* bias_d = bias_tensor->data<float>();
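
    // Copy the weights out of the framework tensors into plain host vectors;
    // the plugin keeps its own copy so it can be serialized with the engine.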
    std::vector<float> scale_v;
    std::vector<float> bias_v;
    for (int i = 0; i < scale_tensor->numel(); i++) {
      scale_v.push_back(scale_d[i]);
      bias_v.push_back(bias_d[i]);
    }

    plugin::InstanceNormPlugin* plugin =
        new plugin::InstanceNormPlugin(eps, scale_v, bias_v);
    nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, 1, plugin);

    auto output_name = op_desc.Output("Y")[0];
    RreplenishLayerAndOutput(layer, "instance_norm", {output_name}, test_mode);
  }
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

REGISTER_TRT_OP_CONVERTER(instance_norm, InstanceNormOpConverter);

@@ -1,5 +1,5 @@
 nv_library(tensorrt_plugin
   SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
   prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu
-  pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu
-  DEPS enforce tensorrt_engine prelu)
+  pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu instance_norm_op_plugin.cu
+  DEPS enforce tensorrt_engine prelu tensor)

@@ -0,0 +1,119 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <cassert>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/platform/cudnn_helper.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
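
// Map a TensorRT data type onto the matching cuDNN data type; the plugin
// only supports FP32 and FP16.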
cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype,
                                      cudnnDataType_t *cudnn_dtype) {
  switch (trt_dtype) {
    case nvinfer1::DataType::kFLOAT:
      *cudnn_dtype = CUDNN_DATA_FLOAT;
      break;
    case nvinfer1::DataType::kHALF:
      *cudnn_dtype = CUDNN_DATA_HALF;
      break;
    default:
      return CUDNN_STATUS_BAD_PARAM;
  }
  return CUDNN_STATUS_SUCCESS;
}

InstanceNormPlugin *CreateInstanceNormPluginDeserialize(const void *buffer,
                                                        size_t length) {
  return new InstanceNormPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("instance_norm_plugin",
                    CreateInstanceNormPluginDeserialize);
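
// Create the cuDNN handle and tensor descriptors once; every enqueue() call
// reuses them.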
int InstanceNormPlugin::initialize() {
  platform::dynload::cudnnCreate(&handle_);
  platform::dynload::cudnnCreateTensorDescriptor(&x_desc_);
  platform::dynload::cudnnCreateTensorDescriptor(&y_desc_);
  platform::dynload::cudnnCreateTensorDescriptor(&b_desc_);
  return 0;
}

nvinfer1::Dims InstanceNormPlugin::getOutputDimensions(
    int index, const nvinfer1::Dims *inputDims, int nbInputs) {
  assert(nbInputs == 1);
  assert(index < this->getNbOutputs());
  nvinfer1::Dims const &input_dims = inputDims[0];
  nvinfer1::Dims output_dims = input_dims;
  return output_dims;
}
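
// Instance norm is implemented by folding the batch dimension into the
// channel dimension (a 1 x (N*C) x H x W tensor) and running cuDNN batch
// norm over it, so every (n, c) slice is normalized independently.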
int InstanceNormPlugin::enqueue(int batch_size, const void *const *inputs,
                                void **outputs, void *workspace,
                                cudaStream_t stream) {
  const auto &input_dims = this->getInputDims(0);

  PADDLE_ENFORCE_EQ(input_dims.nbDims, 3,
                    platform::errors::InvalidArgument(
                        "Input Dims should be 3 (except the batch), got %d",
                        input_dims.nbDims));
  int n = batch_size;
  int c = input_dims.d[0];
  int h = input_dims.d[1];
  int w = input_dims.d[2];

  scale_t.Resize(framework::make_ddim({batch_size, c}));
  bias_t.Resize(framework::make_ddim({batch_size, c}));
  int device_id;
  cudaGetDevice(&device_id);
  float *scale_d = scale_t.mutable_data<float>(platform::CUDAPlace(device_id));
  float *bias_d = bias_t.mutable_data<float>(platform::CUDAPlace(device_id));
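
  // Replicate the per-channel scale/bias once per batch instance to match
  // the folded N*C channel layout.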
  for (int i = 0; i < batch_size; i++) {
    cudaMemcpyAsync(scale_d + i * c, scale_.data(), sizeof(float) * c,
                    cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(bias_d + i * c, bias_.data(), sizeof(float) * c,
                    cudaMemcpyHostToDevice, stream);
  }
  platform::dynload::cudnnSetTensor4dDescriptor(
      b_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, n * c, 1, 1);

  cudnnDataType_t cudnn_dtype;
  nvinfer1::DataType data_type = getDataType();
  convert_trt2cudnn_dtype(data_type, &cudnn_dtype);
  platform::dynload::cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
                                                cudnn_dtype, 1, n * c, h, w);
  platform::dynload::cudnnSetTensor4dDescriptor(y_desc_, CUDNN_TENSOR_NCHW,
                                                cudnn_dtype, 1, n * c, h, w);
  float alpha = 1;
  float beta = 0;
  platform::dynload::cudnnSetStream(handle_, stream);

  void const *x_ptr = inputs[0];
  void *y_ptr = outputs[0];
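  // ForwardTraining (rather than ForwardInference) is used on purpose: it
  // computes the mean/variance of the current mini-batch on the fly, which
  // over the folded layout is exactly the per-instance statistics that
  // instance norm needs.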
  platform::dynload::cudnnBatchNormalizationForwardTraining(
      handle_, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, &alpha, &beta, x_desc_,
      x_ptr, y_desc_, y_ptr, b_desc_, scale_d, bias_d, 1., nullptr, nullptr,
      eps_, nullptr, nullptr);
  return cudaGetLastError() != cudaSuccess;
}

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

@@ -0,0 +1,103 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <string>
#include <vector>

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

class InstanceNormPlugin : public PluginTensorRT {
 private:
  float eps_;
  std::vector<float> scale_;
  std::vector<float> bias_;
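
  // Device-side staging buffers; enqueue() fills them with the scale/bias
  // replicated once per batch instance.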
  framework::Tensor scale_t;
  framework::Tensor bias_t;
  cudnnHandle_t handle_;
  cudnnTensorDescriptor_t x_desc_, y_desc_, b_desc_;

 protected:
  size_t getSerializationSize() override {
    return getBaseSerializationSize() + SerializedSize(eps_) +
           SerializedSize(scale_) + SerializedSize(bias_);
  }

  // TRT calls this function when it needs to serialize the plugin's
  // configuration into the engine. It should not be called by users.
  void serialize(void *buffer) override {
    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
    SerializeValue(&buffer, eps_);
    SerializeValue(&buffer, scale_);
    SerializeValue(&buffer, bias_);
  }

 public:
  explicit InstanceNormPlugin(const float eps, const std::vector<float> scale,
                              const std::vector<float> bias)
      : eps_(eps), scale_(scale), bias_(bias) {
    PADDLE_ENFORCE_EQ(scale.size(), bias.size(),
                      platform::errors::InvalidArgument(
                          "The instanceNorm's scale and bias should be the "
                          "same size. Got scale size = %d, but bias size = %d",
                          scale.size(), bias.size()));
  }

  // Used by TensorRT for deserialization.
  // It should not be called by users.
  InstanceNormPlugin(void const *serialData, size_t serialLength) {
    deserializeBase(serialData, serialLength);
    DeserializeValue(&serialData, &serialLength, &eps_);
    DeserializeValue(&serialData, &serialLength, &scale_);
    DeserializeValue(&serialData, &serialLength, &bias_);
  }

  ~InstanceNormPlugin() {}
  int initialize() override;
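
  // TensorRT calls clone() when it needs its own copy of the plugin, e.g.
  // while building the engine.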
  InstanceNormPlugin *clone() const override {
    return new InstanceNormPlugin(eps_, scale_, bias_);
  }

  const char *getPluginType() const override { return "instance_norm_plugin"; }
  int getNbOutputs() const override { return 1; }
  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
                                     int nbInputDims) override;
  int enqueue(int batchSize, const void *const *inputs, void **outputs,
              void *workspace, cudaStream_t stream) override;

  bool supportsFormat(nvinfer1::DataType type,
                      nvinfer1::PluginFormat format) const override {
    return ((type == nvinfer1::DataType::kFLOAT ||
             type == nvinfer1::DataType::kHALF) &&
            (format == nvinfer1::PluginFormat::kNCHW));
  }
};

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

@@ -0,0 +1,50 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include <vector>

#include "paddle/fluid/inference/tests/api/trt_test_helper.h"

namespace paddle {
namespace inference {

TEST(TensorRT, instance_norm) {
  std::string model_dir = FLAGS_infer_model + "/instance_norm";
  AnalysisConfig config;
  int batch_size = 4;
  config.EnableUseGpu(100, 0);
  config.SetModel(model_dir);
  config.SwitchUseFeedFetchOps(false);
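  // Args: workspace size (1 MB), max batch size, min subgraph size, engine
  // precision, and whether to use a serialized (static) engine.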
  config.EnableTensorRtEngine(1 << 20, batch_size, 0,
                              AnalysisConfig::Precision::kFloat32, false);

  auto predictor = CreatePaddlePredictor(config);

  int length = 4;
  int input_num = batch_size * length;
  // Fill the input with 1.0f. (The original used memset with a float value,
  // which sets bytes rather than floats.)
  std::vector<float> input(input_num, 1.0f);

  auto input_names = predictor->GetInputNames();
  auto input_t = predictor->GetInputTensor(input_names[0]);
  input_t->Reshape({batch_size, length});
  input_t->copy_from_cpu(input.data());

  ASSERT_TRUE(predictor->ZeroCopyRun());
}

}  // namespace inference
}  // namespace paddle