detect TensorRT plugin fp16 at runtime (#27933)

* remove -DSUPPORTS_CUDA_FP16 in cuda.cmake

* compile with cuda9

* add some unittest

* notest;test=coverage

* add unittest for trt plugin swish && split

* update ernie unittest

* fix some error message

* remove repeated judgement of CUDA version in EmbEltwiseLayerNormOpConverter

* fix compile error when CUDA_ARCH_NAME < Pascal

* fix compile error

* update unittest timeout

* compile with cuda9

* update error msg

* fix code style

* add some comments

* add define IF_CUDA_ARCH_SUPPORT_FP16

* rename IF_CUDA_ARCH_SUPPORT_FP16 to CUDA_ARCH_FP16_SUPPORTED
Shang Zhizhou 5 years ago committed by GitHub
parent c39da29db7
commit b9e76a0103

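For context on the scheme as a whole: the per-architecture SUPPORTS_CUDA_FP16 define goes away. Instead, the cmake hunk below keys a single host-side flag, TRT_PLUGIN_FP16_AVALIABLE, on the CUDA toolkit version alone, and device code guards fp16 intrinsics with CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__). Below is a minimal standalone sketch of how the two guards compose; the macro body is an assumption inferred from its call sites in this diff, not copied from the commit.

// Hypothetical sketch (compile with nvcc); illustration only.
#include <cuda_fp16.h>
#include <cstdio>

// Assumed definition: fp16 intrinsics need compute capability >= 6.0 (Pascal).
#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) ((CUDA_ARCH) >= 600)

__global__ void half_add(const half* x, const half* y, half* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
    out[i] = __hadd(x[i], y[i]);  // native fp16 math on sm_60 and newer
#else
    // pre-Pascal fallback: round-trip through fp32
    out[i] = __float2half(__half2float(x[i]) + __half2float(y[i]));
#endif
  }
}

int main() {
#ifdef TRT_PLUGIN_FP16_AVALIABLE  // host-side flag, set by cmake for CUDA >= 10.0
  std::printf("fp16 plugin path compiled in\n");
#else
  std::printf("fp16 plugin path compiled out\n");
#endif
  return 0;
}

Guarding on __CUDA_ARCH__ inside the kernel, rather than via per-arch defines from cmake, lets one fat binary carry both paths and pick the right one per compiled architecture, which is what the title means by detecting plugin fp16 at runtime.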
@ -103,19 +103,10 @@ function(select_nvcc_arch_flags out_variable)
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
set(cuda_arch_bin "50")
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
add_definitions("-DSUPPORTS_CUDA_FP16")
endif()
set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
add_definitions("-DSUPPORTS_CUDA_FP16")
endif()
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
add_definitions("-DSUPPORTS_CUDA_FP16")
endif()
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
@ -194,6 +185,10 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
endif()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
add_definitions("-DTRT_PLUGIN_FP16_AVALIABLE")
endif()
add_definitions("-DCUDA_VERSION_MAJOR=\"${CUDA_VERSION_MAJOR}\"")
add_definitions("-DCUDA_VERSION_MINOR=\"${CUDA_VERSION_MINOR}\"")
add_definitions("-DCUDA_TOOLKIT_ROOT_DIR=\"${CUDA_TOOLKIT_ROOT_DIR}\"")

@ -93,11 +93,12 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
if (engine_->with_dynamic_shape()) {
if (engine_->use_oss()) {
int output_fp16 = static_cast<int>((engine_->WithFp16() == 1) ? 1 : 0);
PADDLE_ENFORCE_EQ(output_fp16, 1,
PADDLE_ENFORCE_EQ(
output_fp16, 1,
platform::errors::InvalidArgument(
"Only Precision::KHalf(fp16) is supported when infering "
"ernie(bert) model with config.EnableTensorRtOSS(). "
"But Precision::KFloat32 is setted."));
"Only Precision::KHalf(fp16) is supported when infering "
"ernie(bert) model with config.EnableTensorRtOSS(). "
"But Precision::KFloat32 is setted."));
const std::vector<nvinfer1::PluginField> fields{
{"bert_embeddings_layernorm_beta", bias,
nvinfer1::PluginFieldType::kFLOAT32,
@ -135,21 +136,23 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
plugin_inputs.emplace_back(engine_->GetITensor(
engine_->network()->getInput(2)->getName())); // cu_seqlens,
// eval_placeholder_2
auto max_seqlen_tensor = engine_->GetITensor(
engine_->network()->getInput(3)->getName());
auto max_seqlen_tensor =
engine_->GetITensor(engine_->network()->getInput(3)->getName());
auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
engine_, Shuffle, *const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
engine_, Shuffle,
*const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
nvinfer1::Dims shape_dim;
shape_dim.nbDims = 1;
shape_dim.d[0] = -1;
shuffle_layer->setReshapeDimensions(shape_dim);
plugin_inputs.emplace_back(shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3
plugin_inputs.emplace_back(
shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3
auto creator = GetPluginRegistry()->getPluginCreator(
"CustomEmbLayerNormPluginDynamic", "2");
auto plugin_obj =
creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr);
auto plugin_obj = creator->createPlugin(
"CustomEmbLayerNormPluginDynamic", plugin_ptr);
auto plugin_layer = engine_->network()->addPluginV2(
plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);
layer = plugin_layer;
@ -159,12 +162,13 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
{output_name, std::string("qkv_plugin_mask")},
test_mode);
} else {
bool use_fp16 = engine_->WithFp16();
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon"));
plugin::DynamicPluginTensorRT* plugin = nullptr;
plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
eps, use_fp16);
eps, with_fp16);
layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name},

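The recurring converter-side change, in the hunk above and in most hunks below, replaces the negative ban_fp16 flag with a positive with_fp16 that is true only when the engine precision is fp16 and plugin fp16 has not been explicitly disabled. A self-contained sketch of the predicate; EngineSketch is a hypothetical stand-in for TensorRTEngine:

#include <iostream>

struct EngineSketch {
  bool precision_is_fp16;    // from AnalysisConfig precision
  bool disable_plugin_fp16;  // user override (hypothetical field names)
  bool WithFp16() const { return precision_is_fp16; }
  bool disable_trt_plugin_fp16() const { return disable_plugin_fp16; }
};

bool PluginRunsFp16(const EngineSketch& e) {
  // The single positive switch handed to every plugin constructor.
  return e.WithFp16() && !e.disable_trt_plugin_fp16();
}

int main() {
  EngineSketch e{true, false};
  std::cout << std::boolalpha << PluginRunsFp16(e) << "\n";  // true
  e.disable_plugin_fp16 = true;
  std::cout << PluginRunsFp16(e) << "\n";                    // false
  return 0;
}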
@ -59,7 +59,10 @@ class GeluOpConverter : public OpConverter {
nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) {
#if IS_TRT_VERSION_GE(6000)
plugin::GeluPluginDynamic* plugin = new plugin::GeluPluginDynamic();
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::GeluPluginDynamic* plugin =
new plugin::GeluPluginDynamic(with_fp16);
layer = engine_->AddPluginV2(&input, input_num, plugin);
#else
PADDLE_THROW(platform::errors::Fatal(
@ -67,7 +70,9 @@ class GeluOpConverter : public OpConverter {
"your TRT version is no less than 6.0"));
#endif
} else {
plugin::GeluPlugin* plugin = new plugin::GeluPlugin();
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::GeluPlugin* plugin = new plugin::GeluPlugin(with_fp16);
layer = engine_->AddPlugin(&input, input_num, plugin);
}
auto output_name = op_desc.Output("Out")[0];

@ -87,7 +87,8 @@ class MultiheadMatMulOpConverter : public OpConverter {
}
};
// [3, N, H] -> [N, 3, H]
auto transpose_bias_v2 = [](const float* src, float* dst, int N, int H) {
auto transpose_bias_v2 = [](const float* src, float* dst, int N,
int H) {
for (int i = 0; i < 3; ++i) {
for (int n = 0; n < N; ++n) {
for (int h = 0; h < H; ++h) {
@ -106,15 +107,16 @@ class MultiheadMatMulOpConverter : public OpConverter {
std::vector<float> bias_data_tmp;
bias_data_tmp.reserve(bias_t->numel());
memcpy(bias_data_tmp.data(), bias_data, bias_t->numel() * sizeof(float));
memcpy(bias_data_tmp.data(), bias_data,
bias_t->numel() * sizeof(float));
transpose_bias_v2(bias_data_tmp.data(), bias_data, head_number,
head_size);
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT,
static_cast<void*>(bias_data),
static_cast<int32_t>(bias_t->numel())};
auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n,
weight, bias);
auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
n, weight, bias);
auto mask_tensor = engine_->GetITensor("qkv_plugin_mask");
@ -151,15 +153,17 @@ class MultiheadMatMulOpConverter : public OpConverter {
plugin_inputs.emplace_back(engine_->GetITensor(
engine_->network()->getInput(2)->getName())); // cu_seqlens,
// eval_placeholder_2
auto max_seqlen_tensor = engine_->GetITensor(
engine_->network()->getInput(3)->getName());
auto max_seqlen_tensor =
engine_->GetITensor(engine_->network()->getInput(3)->getName());
auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
engine_, Shuffle, *const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
engine_, Shuffle,
*const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
nvinfer1::Dims shape_dim;
shape_dim.nbDims = 1;
shape_dim.d[0] = -1;
shuffle_layer->setReshapeDimensions(shape_dim);
plugin_inputs.emplace_back(shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3
plugin_inputs.emplace_back(
shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3
auto plugin_layer = engine_->network()->addPluginV2(
plugin_inputs.data(), plugin_inputs.size(), *plugin);
@ -178,8 +182,8 @@ class MultiheadMatMulOpConverter : public OpConverter {
static_cast<void*>(bias_data),
static_cast<size_t>(bias_t->numel())};
auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n,
weight.get(), bias.get());
auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
n, weight.get(), bias.get());
auto* fc_out = fc_layer->getOutput(0);
// add qkv to context
int head_size = all_head_size / head_number;
@ -188,10 +192,11 @@ class MultiheadMatMulOpConverter : public OpConverter {
std::vector<nvinfer1::ITensor*> plugin_inputs;
plugin_inputs.push_back(fc_out);
plugin_inputs.push_back(input_bias_qk);
bool ban_fp16 = engine_->disable_trt_plugin_fp16();
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::DynamicPluginTensorRT* plugin =
new plugin::QkvToContextPluginDynamic(hidden, head_number, head_size,
scale, ban_fp16);
new plugin::QkvToContextPluginDynamic(hidden, head_number,
head_size, scale, with_fp16);
layer = engine_->AddPluginV2(plugin_inputs.data(), 2, plugin);
}
} else {

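The [3, N, H] -> [N, 3, H] comment above describes regrouping the QKV bias from matrix-major to head-major order for the oss plugin. The hunk only shows the loop headers, so the assignment in the sketch below is reconstructed from that shape comment and is not the committed line:

#include <cstdio>

// dst[n][i][h] = src[i][n][h], with i in [0,3) indexing Q/K/V.
void transpose_bias_v2(const float* src, float* dst, int N, int H) {
  for (int i = 0; i < 3; ++i)
    for (int n = 0; n < N; ++n)
      for (int h = 0; h < H; ++h)
        dst[n * 3 * H + i * H + h] = src[i * N * H + n * H + h];
}

int main() {
  const int N = 2, H = 2;
  float src[3 * N * H], dst[3 * N * H];
  for (int k = 0; k < 3 * N * H; ++k) src[k] = static_cast<float>(k);
  transpose_bias_v2(src, dst, N, H);
  for (int k = 0; k < 3 * N * H; ++k) std::printf("%g ", dst[k]);
  std::printf("\n");  // prints: 0 1 4 5 8 9 2 3 6 7 10 11
  return 0;
}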
@ -76,8 +76,8 @@ class SkipLayerNormOpConverter : public OpConverter {
pluginPtr->nbFields = static_cast<int>(fields.size());
pluginPtr->fields = fields.data();
auto pluginObj =
creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr);
auto pluginObj = creator->createPlugin(
"CustomSkipLayerNormPluginDynamic", pluginPtr);
auto plugin_layer = engine_->network()->addPluginV2(
inputs.data(), inputs.size(), *pluginObj);
@ -85,10 +85,11 @@ class SkipLayerNormOpConverter : public OpConverter {
layer = plugin_layer;
} else {
float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon"));
bool ban_fp16 = engine_->disable_trt_plugin_fp16();
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::SkipLayerNormPluginDynamic* plugin =
new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size,
scale_size, eps, ban_fp16);
scale_size, eps, with_fp16);
layer = engine_->AddPluginV2(inputs.data(), 2, plugin);
}
} else {

@ -93,9 +93,10 @@ class SliceOpConverter : public OpConverter {
layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(),
plugin);
} else {
bool ban_fp16 = engine_->disable_trt_plugin_fp16();
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::SlicePluginDynamic* plugin =
new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16);
new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16);
layer = engine_->AddPluginV2(&input, 1, plugin);
}
#else
@ -104,9 +105,10 @@ class SliceOpConverter : public OpConverter {
"your TRT version is no less than 6.0"));
#endif
} else {
bool ban_fp16 = engine_->disable_trt_plugin_fp16();
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::SlicePlugin* plugin =
new plugin::SlicePlugin(starts, ends, axes, ban_fp16);
new plugin::SlicePlugin(starts, ends, axes, with_fp16);
layer = engine_->AddPlugin(&input, 1, plugin);
}

@ -86,8 +86,10 @@ class SplitOpConverter : public OpConverter {
nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) {
#if IS_TRT_VERSION_GE(6000)
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::SplitPluginDynamic* plugin =
new plugin::SplitPluginDynamic(axis, output_lengths);
new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16);
layer = engine_->AddPluginV2(&input, input_num, plugin);
#else
PADDLE_THROW(platform::errors::Fatal(
@ -95,8 +97,10 @@ class SplitOpConverter : public OpConverter {
"your TRT version is no less than 6.0"));
#endif
} else {
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::SplitPlugin* plugin =
new plugin::SplitPlugin(axis, output_lengths);
new plugin::SplitPlugin(axis, output_lengths, with_fp16);
layer = engine_->AddPlugin(&input, input_num, plugin);
}

@ -46,8 +46,10 @@ class StackOpConverter : public OpConverter {
nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) {
#if IS_TRT_VERSION_GE(6000)
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::StackPluginDynamic* plugin =
new plugin::StackPluginDynamic(axis, input_num);
new plugin::StackPluginDynamic(axis, input_num, with_fp16);
layer = engine_->AddPluginV2(inputs, input_num, plugin);
assert(layer != nullptr);
#else

@ -60,7 +60,10 @@ class SwishOpConverter : public OpConverter {
nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) {
#if IS_TRT_VERSION_GE(6000)
plugin::SwishPluginDynamic* plugin = new plugin::SwishPluginDynamic(beta);
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::SwishPluginDynamic* plugin =
new plugin::SwishPluginDynamic(beta, with_fp16);
layer = engine_->AddPluginV2(&input, input_num, plugin);
#else
PADDLE_THROW(platform::errors::Fatal(
@ -68,7 +71,9 @@ class SwishOpConverter : public OpConverter {
"your TRT version is no less than 6.0"));
#endif
} else {
plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta);
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta, with_fp16);
layer = engine_->AddPlugin(&input, input_num, plugin);
}

@ -160,9 +160,9 @@ int EmbEltwiseLayernormPluginDynamicImpl<T>::enqueue(
}
template class EmbEltwiseLayernormPluginDynamicImpl<float>;
#ifdef SUPPORTS_CUDA_FP16
#ifdef TRT_PLUGIN_FP16_AVALIABLE
template class EmbEltwiseLayernormPluginDynamicImpl<half>;
#endif // SUPPORTS_CUDA_FP16
#endif
int EmbEltwiseLayernormPluginDynamic::initialize() {
impl_->initialize();

@ -8,7 +8,7 @@
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and
// limitations under the License.
@ -105,18 +105,24 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
scale_size_(scale_size),
hidden_size_(hidden_size),
eps_(eps),
with_fp16_(with_fp16),
own_host_buff_(false) {
if (with_fp16) {
#ifdef SUPPORTS_CUDA_FP16
with_fp16_ = with_fp16;
if (with_fp16_) {
#ifdef TRT_PLUGIN_FP16_AVALIABLE
VLOG(1) << "TRT Plugin DataType selected. EmbEltwiseLayerNorm-->fp16";
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
#else
PADDLE_THROW(platform::errors::Fatal(
"Unsupported data type, current GPU doesn't support half."));
#endif // SUPPORTS_CUDA_FP16
"The Ernie(Bert) tensorRT plugin should be "
"complied with CUDA version >= 10.0 when running with fp16. "
"Please recomplie it or try to use fp32 by set "
"config.EnableTensorRtEngine(1 << 30, 1, 5, "
"AnalysisConfig::Precision::kFloat32, false, false) "));
#endif
} else {
VLOG(1) << "TRT Plugin DataType selected. EmbEltwiseLayerNorm-->fp32";
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
@ -160,14 +166,18 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
if (with_fp16_) {
#ifdef SUPPORTS_CUDA_FP16
#ifdef TRT_PLUGIN_FP16_AVALIABLE
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
#else
PADDLE_THROW(platform::errors::Fatal(
"Unsupported data type, current GPU doesn't support half."));
#endif // SUPPORTS_CUDA_FP16
"The Ernie(Bert) tensorRT plugin should be "
"complied with CUDA version >= 10.0 when running with fp16. "
"Please recomplie it or try to use fp32 by set "
"config.EnableTensorRtEngine(1 << 30, 1, 5, "
"AnalysisConfig::Precision::kFloat32, false, false) "));
#endif
} else {
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
@ -283,7 +293,6 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
int hidden_size_;
float eps_;
bool with_fp16_;
bool own_host_buff_{false};
EmbEltwiseLayernormPluginDynamicImplBase* impl_{nullptr};
};

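with_fp16_ now travels through serialization, so a deserialized engine rebuilds the same impl_ (half vs. float) it was built with. The helpers below are minimal stand-ins for Paddle's SerializedSize / SerializeValue / DeserializeValue (the real ones live in trt_plugin_utils.h and also handle vectors); they sketch the round trip between serialize() and the deserialization constructor:

#include <cassert>
#include <cstddef>
#include <cstring>

template <typename T> size_t SerializedSize(const T&) { return sizeof(T); }

template <typename T> void SerializeValue(void** buffer, const T& value) {
  std::memcpy(*buffer, &value, sizeof(T));
  *buffer = static_cast<char*>(*buffer) + sizeof(T);
}

template <typename T>
void DeserializeValue(const void** data, size_t* length, T* value) {
  assert(*length >= sizeof(T));
  std::memcpy(value, *data, sizeof(T));
  *data = static_cast<const char*>(*data) + sizeof(T);
  *length -= sizeof(T);
}

int main() {
  bool with_fp16 = true;
  char buf[sizeof(bool)];
  void* w = buf;
  SerializeValue(&w, with_fp16);          // engine save: serialize()
  const void* r = buf;
  size_t len = sizeof(buf);
  bool restored = false;
  DeserializeValue(&r, &len, &restored);  // engine load: deserialization ctor
  assert(restored == with_fp16 && len == 0);
  return 0;
}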
@ -17,6 +17,7 @@
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace inference {
@ -38,14 +39,14 @@ REGISTER_TRT_PLUGIN("gelu_plugin", CreateGeluPluginDeserialize);
bool GeluPlugin::supportsFormat(nvinfer1::DataType type,
nvinfer1::PluginFormat format) const {
#ifdef SUPPORTS_CUDA_FP16
return ((type == nvinfer1::DataType::kFLOAT ||
type == nvinfer1::DataType::kHALF) &&
(format == nvinfer1::PluginFormat::kNCHW));
#else
return ((type == nvinfer1::DataType::kFLOAT) &&
(format == nvinfer1::PluginFormat::kNCHW));
#endif
if (with_fp16_) {
return ((type == nvinfer1::DataType::kFLOAT ||
type == nvinfer1::DataType::kHALF) &&
(format == nvinfer1::PluginFormat::kNCHW));
} else {
return ((type == nvinfer1::DataType::kFLOAT) &&
(format == nvinfer1::PluginFormat::kNCHW));
}
}
nvinfer1::Dims GeluPlugin::getOutputDimensions(int index,
@ -87,6 +88,7 @@ __device__ half do_tanh<half>(half a) {
template <typename T, unsigned TPB>
__global__ void no_exact_gelu_kernel(const T a, const T b, const T c, int n,
const T* input, T* output) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const int idx = blockIdx.x * TPB + threadIdx.x;
if (idx < n) {
const T in = input[idx];
@ -94,6 +96,7 @@ __global__ void no_exact_gelu_kernel(const T a, const T b, const T c, int n,
const T cdf = a + a * do_tanh<T>(tmp);
output[idx] = in * cdf;
}
#endif
}
int GeluPlugin::enqueue(int batch_size, const void* const* inputs,
@ -108,21 +111,18 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs,
auto type = getDataType();
if (type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32";
const float* input = static_cast<const float*>(inputs[0]);
float* output = static_cast<float*>(outputs[0]);
gelu_kernel<float, block_size><<<grid_size, block_size, 0, stream>>>(
kA, num, input, output);
} else if (type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16";
const half* input = static_cast<const half*>(inputs[0]);
half* output = static_cast<half*>(outputs[0]);
no_exact_gelu_kernel<half,
block_size><<<grid_size, block_size, 0, stream>>>(
kAT, kBT, kCT, num, input, output);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"The Gelu TRT Plugin's input type should be float or half."));
@ -155,14 +155,14 @@ bool GeluPluginDynamic::supportsFormatCombination(
const nvinfer1::PluginTensorDesc& in = in_out[pos];
if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#else
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#endif
if (with_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
}
const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1];
// output
@ -189,21 +189,18 @@ int GeluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32";
const float* input = static_cast<const float*>(inputs[0]);
float* output = static_cast<float*>(outputs[0]);
gelu_kernel<float, block_size><<<grid_size, block_size, 0, stream>>>(
kA, num, input, output);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16";
const half* input = static_cast<const half*>(inputs[0]);
half* output = static_cast<half*>(outputs[0]);
no_exact_gelu_kernel<half,
block_size><<<grid_size, block_size, 0, stream>>>(
kAT, kBT, kCT, num, input, output);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"The Gelu TRT Plugin's input type should be float or half."));

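For reference, the two kernels in this file compute GELU two ways: gelu_kernel uses the exact erf form, and no_exact_gelu_kernel the tanh approximation, the only half-friendly one since do_tanh<half> exists. The constants kA and kAT/kBT/kCT are defined outside the hunk, so the values below are the standard GELU constants and are an assumption:

#include <cmath>
#include <cstdio>

float gelu_exact(float x) {  // mirrors gelu_kernel's fp32 path
  return 0.5f * x * (1.0f + std::erf(x * 0.70710678f));  // 0.70710678 = 1/sqrt(2)
}

float gelu_tanh(float x) {   // mirrors no_exact_gelu_kernel
  const float kAT = 0.5f;           // a
  const float kBT = 0.7978845608f;  // b = sqrt(2/pi)
  const float kCT = 0.0356774081f;  // c = 0.044715 * sqrt(2/pi)
  const float tmp = x * (kCT * x * x + kBT);
  return x * (kAT + kAT * std::tanh(tmp));
}

int main() {
  for (float x : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f})
    std::printf("x=%5.2f  exact=%9.6f  tanh=%9.6f\n", x, gelu_exact(x),
                gelu_tanh(x));
  return 0;
}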
@ -26,7 +26,7 @@ namespace plugin {
class GeluPlugin : public PluginTensorRT {
public:
GeluPlugin() {}
explicit GeluPlugin(const bool with_fp16) { with_fp16_ = with_fp16; }
// It was used for tensorrt deserialization.
// It should not be called by users.
@ -35,7 +35,7 @@ class GeluPlugin : public PluginTensorRT {
}
~GeluPlugin() {}
GeluPlugin* clone() const override { return new GeluPlugin(); }
GeluPlugin* clone() const override { return new GeluPlugin(with_fp16_); }
const char* getPluginType() const override { return "gelu_plugin"; }
int getNbOutputs() const override { return 1; }
@ -63,20 +63,26 @@ class GeluPlugin : public PluginTensorRT {
#if IS_TRT_VERSION_GE(6000)
class GeluPluginDynamic : public DynamicPluginTensorRT {
public:
GeluPluginDynamic() {}
GeluPluginDynamic(void const* serial_data, size_t serial_length) {}
explicit GeluPluginDynamic(const bool with_fp16) { with_fp16_ = with_fp16; }
GeluPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
}
~GeluPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new GeluPluginDynamic();
return new GeluPluginDynamic(with_fp16_);
}
const char* getPluginType() const override { return "gelu_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override { return 0; }
size_t getSerializationSize() const override { return 0; }
void serialize(void* buffer) const override {}
size_t getSerializationSize() const override {
return SerializedSize(with_fp16_);
}
void serialize(void* buffer) const override {
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,

@ -109,7 +109,6 @@ inline void TransposeQKV(const int batch, const int seq_len,
}
}
#ifdef SUPPORTS_CUDA_FP16
inline void TransposeQKV(const int batch, const int seq_len,
const int head_size, const int head_num,
const half *input, half *output, cudaStream_t stream) {
@ -148,7 +147,6 @@ inline void TransposeQKV(const int batch, const int seq_len,
output);
}
}
#endif
int QkvToContextPluginDynamic::initialize() { return 0; }
@ -195,19 +193,19 @@ bool QkvToContextPluginDynamic::supportsFormatCombination(
const nvinfer1::PluginTensorDesc &in = in_out[pos];
if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16
if (ban_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
} else {
if (with_fp16_) {
#ifdef TRT_PLUGIN_FP16_AVALIABLE
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
#else
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#endif
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
}
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
@ -247,6 +245,7 @@ int QkvToContextPluginDynamic::enqueue(
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. QkvToContext-->fp32";
auto *multihead_temp_data = multihead_temp_tensor.mutable_data<float>(
platform::CUDAPlace(device_id));
auto *qkptr = multihead_temp_data;
@ -275,7 +274,8 @@ int QkvToContextPluginDynamic::enqueue(
head_number_, head_size_);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
#ifdef TRT_PLUGIN_FP16_AVALIABLE
VLOG(1) << "TRT Plugin DataType selected. QkvToContext-->fp16";
auto *multihead_temp_data =
multihead_temp_tensor.mutable_data<int16_t>( // NOLINT
platform::CUDAPlace(device_id));
@ -305,7 +305,11 @@ int QkvToContextPluginDynamic::enqueue(
head_number_, head_size_);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
"The Ernie(Bert) TensorRT Plugin should be "
"complied with CUDA version >= 10.0 when running with fp16. "
"Please recomplie it or try to use fp32 by set "
"config.SetTRTDynamicShapeInfo(min_input_shape, "
"max_input_shape, opt_input_shape, true"));
#endif
} else {
PADDLE_THROW(platform::errors::Fatal(

@ -44,23 +44,24 @@ namespace plugin {
class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
public:
explicit QkvToContextPluginDynamic(int hidden, int head_number, int head_size,
float scale, bool ban_fp16)
float scale, bool with_fp16)
: hidden_(hidden),
head_number_(head_number),
head_size_(head_size),
scale_(scale),
ban_fp16_(ban_fp16) {}
scale_(scale) {
with_fp16_ = with_fp16;
}
QkvToContextPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &hidden_);
DeserializeValue(&serial_data, &serial_length, &head_number_);
DeserializeValue(&serial_data, &serial_length, &head_size_);
DeserializeValue(&serial_data, &serial_length, &scale_);
DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new QkvToContextPluginDynamic(hidden_, head_number_, head_size_,
scale_, ban_fp16_);
scale_, with_fp16_);
}
const char* getPluginType() const override { return "qkv_to_context_plugin"; }
@ -70,14 +71,14 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
size_t getSerializationSize() const override {
return SerializedSize(hidden_) + SerializedSize(head_number_) +
SerializedSize(head_size_) + SerializedSize(scale_) +
SerializedSize(ban_fp16_);
SerializedSize(with_fp16_);
}
void serialize(void* buffer) const override {
SerializeValue(&buffer, hidden_);
SerializeValue(&buffer, head_number_);
SerializeValue(&buffer, head_size_);
SerializeValue(&buffer, scale_);
SerializeValue(&buffer, ban_fp16_);
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs getOutputDimensions(
@ -115,7 +116,6 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
int head_number_;
int head_size_;
float scale_;
bool ban_fp16_;
};
class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator {

@ -66,19 +66,19 @@ bool SkipLayerNormPluginDynamic::supportsFormatCombination(
const nvinfer1::PluginTensorDesc &in = in_out[pos];
if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16
if (ban_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
} else {
if (with_fp16_) {
#ifdef TRT_PLUGIN_FP16_AVALIABLE
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
#else
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#endif
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
}
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
@ -114,6 +114,7 @@ int SkipLayerNormPluginDynamic::enqueue(
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. SkipLayerNorm-->fp32";
const float *input1 = static_cast<const float *>(inputs[0]);
const float *input2 = static_cast<const float *>(inputs[1]);
float *output = static_cast<float *>(outputs[0]);
@ -121,7 +122,8 @@ int SkipLayerNormPluginDynamic::enqueue(
skip_layer_norm_func(num, hidden, input1, input2, scale_gpu_, bias_gpu_,
output, eps_, stream);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
#ifdef TRT_PLUGIN_FP16_AVALIABLE
VLOG(1) << "TRT Plugin DataType selected. SkipLayerNorm-->fp16";
const half *input1 = static_cast<const half *>(inputs[0]);
const half *input2 = static_cast<const half *>(inputs[1]);
half *output = static_cast<half *>(outputs[0]);
@ -130,7 +132,11 @@ int SkipLayerNormPluginDynamic::enqueue(
output, static_cast<half>(eps_), stream);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
"The Ernie(Bert) tensorRT plugin should be "
"complied with CUDA version >= 10.0 when running with fp16. "
"Please recomplie it or try to use fp32 by set "
"config.SetTRTDynamicShapeInfo(min_input_shape, "
"max_input_shape, opt_input_shape, true"));
#endif
} else {
PADDLE_THROW(platform::errors::Fatal(

@ -31,11 +31,9 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
public:
explicit SkipLayerNormPluginDynamic(const float* bias, const float* scale,
int bias_size, int scale_size,
const float eps, bool ban_fp16)
: bias_size_(bias_size),
scale_size_(scale_size),
eps_(eps),
ban_fp16_(ban_fp16) {
const float eps, bool with_fp16)
: bias_size_(bias_size), scale_size_(scale_size), eps_(eps) {
with_fp16_ = with_fp16;
bias_.resize(bias_size);
scale_.resize(scale_size);
std::copy(bias, bias + bias_size, bias_.data());
@ -47,12 +45,12 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
DeserializeValue(&serial_data, &serial_length, &bias_size_);
DeserializeValue(&serial_data, &serial_length, &scale_size_);
DeserializeValue(&serial_data, &serial_length, &eps_);
DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
}
nvinfer1::IPluginV2DynamicExt* clone() const override {
auto ptr = new SkipLayerNormPluginDynamic(
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, with_fp16_);
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
@ -65,7 +63,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
size_t getSerializationSize() const override {
size_t ser_size = SerializedSize(bias_) + SerializedSize(scale_) +
SerializedSize(bias_size_) + SerializedSize(scale_size_) +
SerializedSize(eps_) + SerializedSize(eps_);
SerializedSize(eps_) + SerializedSize(with_fp16_);
return ser_size;
}
void serialize(void* buffer) const override {
@ -74,7 +72,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
SerializeValue(&buffer, bias_size_);
SerializeValue(&buffer, scale_size_);
SerializeValue(&buffer, eps_);
SerializeValue(&buffer, ban_fp16_);
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs getOutputDimensions(
@ -118,7 +116,6 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
int scale_size_;
float eps_;
bool ban_fp16_;
};
class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator {

@ -59,8 +59,9 @@ __global__ void SliceKernel(int num, int dims, const T *input,
}
SlicePlugin::SlicePlugin(std::vector<int> starts, std::vector<int> ends,
std::vector<int> axes, bool ban_fp16)
: starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {
std::vector<int> axes, bool with_fp16)
: starts_(starts), ends_(ends), axes_(axes) {
with_fp16_ = with_fp16;
cudaEventCreate(&copy_event_);
cudaStreamCreate(&copy_stream_);
}
@ -70,7 +71,6 @@ SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &starts_);
DeserializeValue(&serial_data, &serial_length, &ends_);
DeserializeValue(&serial_data, &serial_length, &axes_);
DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
cudaEventCreate(&copy_event_);
cudaStreamCreate(&copy_stream_);
}
@ -82,19 +82,19 @@ SlicePlugin::~SlicePlugin() {
}
SlicePlugin *SlicePlugin::clone() const {
return new SlicePlugin(starts_, ends_, axes_, ban_fp16_);
return new SlicePlugin(starts_, ends_, axes_, with_fp16_);
}
bool SlicePlugin::supportsFormat(nvinfer1::DataType type,
nvinfer1::PluginFormat format) const {
#ifdef SUPPORTS_CUDA_FP16
return ((type == nvinfer1::DataType::kFLOAT ||
type == nvinfer1::DataType::kHALF) &&
(format == nvinfer1::PluginFormat::kNCHW));
#else
return ((type == nvinfer1::DataType::kFLOAT) &&
(format == nvinfer1::PluginFormat::kNCHW));
#endif
if (with_fp16_) {
return ((type == nvinfer1::DataType::kFLOAT ||
type == nvinfer1::DataType::kHALF) &&
(format == nvinfer1::PluginFormat::kNCHW));
} else {
return ((type == nvinfer1::DataType::kFLOAT) &&
(format == nvinfer1::PluginFormat::kNCHW));
}
}
nvinfer1::Dims SlicePlugin::getOutputDimensions(int index,
@ -170,20 +170,17 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs,
int blocks = (out_num + threads - 1) / threads;
auto input_type = getDataType();
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. Slice-->fp32";
const float *input1 = static_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
SliceKernel<float><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
out_num, num_dims, input1, offset_temp_data_, output);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
VLOG(1) << "TRT Plugin DataType selected. Slice-->fp16";
const half *input1 = static_cast<const half *>(inputs[0]);
half *output = static_cast<half *>(outputs[0]);
SliceKernel<half><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
out_num, num_dims, input1, offset_temp_data_, output);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
} else {
PADDLE_THROW(platform::errors::Fatal(
"The Slice TRT Plugin's input type should be float or half."));
@ -194,7 +191,7 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs,
size_t SlicePlugin::getSerializationSize() {
return getBaseSerializationSize() + SerializedSize(getPluginType()) +
SerializedSize(starts_) + SerializedSize(ends_) +
SerializedSize(axes_) + SerializedSize(ban_fp16_);
SerializedSize(axes_);
}
void SlicePlugin::serialize(void *buffer) {
@ -203,15 +200,15 @@ void SlicePlugin::serialize(void *buffer) {
SerializeValue(&buffer, starts_);
SerializeValue(&buffer, ends_);
SerializeValue(&buffer, axes_);
SerializeValue(&buffer, ban_fp16_);
}
// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)
SlicePluginDynamic::SlicePluginDynamic(std::vector<int> starts,
std::vector<int> ends,
std::vector<int> axes, bool ban_fp16)
: starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {
std::vector<int> axes, bool with_fp16)
: starts_(starts), ends_(ends), axes_(axes) {
with_fp16_ = with_fp16;
cudaEventCreate(&copy_event_);
cudaStreamCreate(&copy_stream_);
}
@ -221,7 +218,7 @@ SlicePluginDynamic::SlicePluginDynamic(void const *serialData,
DeserializeValue(&serialData, &serialLength, &starts_);
DeserializeValue(&serialData, &serialLength, &ends_);
DeserializeValue(&serialData, &serialLength, &axes_);
DeserializeValue(&serialData, &serialLength, &ban_fp16_);
DeserializeValue(&serialData, &serialLength, &with_fp16_);
cudaEventCreate(&copy_event_);
cudaStreamCreate(&copy_stream_);
}
@ -237,7 +234,7 @@ int SlicePluginDynamic::initialize() { return 0; }
size_t SlicePluginDynamic::getSerializationSize() const {
size_t size = SerializedSize(starts_) + SerializedSize(ends_) +
SerializedSize(axes_) + SerializedSize(ban_fp16_);
SerializedSize(axes_) + SerializedSize(with_fp16_);
return size;
}
@ -246,7 +243,7 @@ void SlicePluginDynamic::serialize(void *buffer) const {
SerializeValue(&buffer, starts_);
SerializeValue(&buffer, ends_);
SerializeValue(&buffer, axes_);
SerializeValue(&buffer, ban_fp16_);
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions(
@ -278,19 +275,14 @@ bool SlicePluginDynamic::supportsFormatCombination(
const nvinfer1::PluginTensorDesc &in = in_out[pos];
if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16
if (ban_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
} else {
if (with_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
#else
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#endif
}
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
// output
@ -362,20 +354,17 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
int blocks = (out_num + threads - 1) / threads;
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. Slice-->fp32";
const float *input1 = static_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
SliceKernel<float><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
out_num, num_dims, input1, offset_temp_data_, output);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
VLOG(1) << "TRT Plugin DataType selected. Slice-->fp16";
const half *input1 = static_cast<const half *>(inputs[0]);
half *output = static_cast<half *>(outputs[0]);
SliceKernel<half><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
out_num, num_dims, input1, offset_temp_data_, output);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
} else {
PADDLE_THROW(platform::errors::Fatal(
"The Slice TRT Plugin's input type should be float or half."));

@ -29,7 +29,7 @@ namespace plugin {
class SlicePlugin : public PluginTensorRT {
public:
explicit SlicePlugin(std::vector<int> starts, std::vector<int> ends,
std::vector<int> axes, bool ban_fp16);
std::vector<int> axes, bool with_fp16);
// It was used for tensorrt deserialization.
// It should not be called by users.
@ -58,7 +58,6 @@ class SlicePlugin : public PluginTensorRT {
std::vector<int> starts_;
std::vector<int> ends_;
std::vector<int> axes_;
bool ban_fp16_{false};
int* offset_temp_data_{nullptr};
cudaEvent_t copy_event_;
cudaStream_t copy_stream_;
@ -68,10 +67,10 @@ class SlicePlugin : public PluginTensorRT {
class SlicePluginDynamic : public DynamicPluginTensorRT {
public:
explicit SlicePluginDynamic(std::vector<int> starts, std::vector<int> ends,
std::vector<int> axes, bool ban_fp16);
std::vector<int> axes, bool with_fp16);
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new SlicePluginDynamic(starts_, ends_, axes_, ban_fp16_);
return new SlicePluginDynamic(starts_, ends_, axes_, with_fp16_);
}
SlicePluginDynamic(void const* serialData, size_t serialLength);
@ -117,7 +116,6 @@ class SlicePluginDynamic : public DynamicPluginTensorRT {
std::vector<int> starts_;
std::vector<int> ends_;
std::vector<int> axes_;
bool ban_fp16_{false};
int* offset_temp_data_{nullptr};
cudaEvent_t copy_event_;
cudaStream_t copy_stream_;

@ -145,9 +145,16 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
#if IS_TRT_VERSION_GE(6000)
int SplitPluginDynamic::initialize() { return 0; }
size_t SplitPluginDynamic::getSerializationSize() const { return 0; }
size_t SplitPluginDynamic::getSerializationSize() const {
return SerializedSize(axis_) + SerializedSize(output_length_) +
SerializedSize(with_fp16_);
}
void SplitPluginDynamic::serialize(void* buffer) const {}
void SplitPluginDynamic::serialize(void* buffer) const {
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, output_length_);
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs SplitPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
@ -183,14 +190,14 @@ bool SplitPluginDynamic::supportsFormatCombination(
const nvinfer1::PluginTensorDesc& in = in_out[pos];
if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#else
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#endif
if (with_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
}
const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1];
// output
@ -234,6 +241,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. Split-->fp32";
thrust::device_vector<float*> d_output_ptrs;
d_output_ptrs.resize(this->getNbOutputs(), nullptr);
@ -249,7 +257,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
inner_cols, axis_shape, outer_rows);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
VLOG(1) << "TRT Plugin DataType selected. Split-->fp16";
thrust::device_vector<half*> d_output_ptrs;
d_output_ptrs.resize(this->getNbOutputs(), nullptr);
@ -264,10 +272,6 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
inner_cols, axis_shape, outer_rows);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
}
return cudaGetLastError() != cudaSuccess;
}

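Besides the fp16 plumbing, the hunk above fixes a genuine round-trip bug: SplitPluginDynamic previously reported a serialization size of 0 and serialized nothing, so axis_ and output_length_ were lost whenever a built engine was saved and reloaded. A sketch of why the vector needs a length prefix; the helper implementations below are assumed, not Paddle's:

#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

// Assumed layout: element count first, then the raw elements.
size_t SerializedSize(const std::vector<int>& v) {
  return sizeof(size_t) + v.size() * sizeof(int);
}
void SerializeValue(void** buf, const std::vector<int>& v) {
  size_t n = v.size();
  std::memcpy(*buf, &n, sizeof(n));
  std::memcpy(static_cast<char*>(*buf) + sizeof(n), v.data(), n * sizeof(int));
  *buf = static_cast<char*>(*buf) + sizeof(n) + n * sizeof(int);
}
void DeserializeValue(const void** data, size_t* len, std::vector<int>* v) {
  size_t n;
  std::memcpy(&n, *data, sizeof(n));
  v->resize(n);
  std::memcpy(v->data(), static_cast<const char*>(*data) + sizeof(n),
              n * sizeof(int));
  *data = static_cast<const char*>(*data) + sizeof(n) + n * sizeof(int);
  *len -= sizeof(n) + n * sizeof(int);
}

int main() {
  std::vector<int> output_length{16, 16, 32};  // per-output lengths along axis_
  std::vector<char> buf(SerializedSize(output_length));
  void* w = buf.data();
  SerializeValue(&w, output_length);
  const void* r = buf.data();
  size_t len = buf.size();
  std::vector<int> restored;
  DeserializeValue(&r, &len, &restored);
  assert(restored == output_length && len == 0);
  return 0;
}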
@ -15,6 +15,7 @@
#pragma once
#include <thrust/device_vector.h>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
@ -27,8 +28,10 @@ namespace plugin {
class SplitPlugin : public PluginTensorRT {
public:
SplitPlugin() {}
SplitPlugin(int axis, std::vector<int> const& output_lengths)
: axis_(axis), same_shape_(true), output_length_(output_lengths) {}
SplitPlugin(int axis, std::vector<int> const& output_lengths, bool with_fp16)
: axis_(axis), same_shape_(true), output_length_(output_lengths) {
with_fp16_ = with_fp16;
}
SplitPlugin(void const* serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length);
@ -37,7 +40,7 @@ class SplitPlugin : public PluginTensorRT {
}
SplitPlugin* clone() const override {
return new SplitPlugin(axis_, output_length_);
return new SplitPlugin(axis_, output_length_, with_fp16_);
}
const char* getPluginType() const override { return "split_plugin"; }
@ -77,13 +80,20 @@ class SplitPlugin : public PluginTensorRT {
#if IS_TRT_VERSION_GE(6000)
class SplitPluginDynamic : public DynamicPluginTensorRT {
public:
SplitPluginDynamic(int axis, std::vector<int> const& output_lengths)
: axis_(axis), output_length_(output_lengths) {}
SplitPluginDynamic(int axis, std::vector<int> const& output_lengths,
bool with_fp16)
: axis_(axis), output_length_(output_lengths) {
with_fp16_ = with_fp16;
}
SplitPluginDynamic(void const* serial_data, size_t serial_length) {}
SplitPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &output_length_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new SplitPluginDynamic(axis_, output_length_);
return new SplitPluginDynamic(axis_, output_length_, with_fp16_);
}
const char* getPluginType() const override { return "split_plugin"; }
@ -127,6 +137,46 @@ class SplitPluginDynamic : public DynamicPluginTensorRT {
int axis_;
std::vector<int> output_length_;
};
class SplitPluginV2Creator : public nvinfer1::IPluginCreator {
public:
SplitPluginV2Creator() {}
const char* getPluginName() const override { return "split_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new SplitPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(SplitPluginV2Creator);
#endif
} // namespace plugin

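SplitPluginV2Creator plus REGISTER_TRT_PLUGIN_V2 is what lets TensorRT locate the plugin by name ("split_plugin") when deserializing an engine. The mechanism reduces to a static registrar object whose constructor runs before main; the mock below only illustrates that idea and is not the TensorRT or Paddle API:

#include <cstddef>
#include <functional>
#include <iostream>
#include <map>
#include <string>

using Factory = std::function<void*(const void*, size_t)>;

std::map<std::string, Factory>& Registry() {
  static std::map<std::string, Factory> r;  // constructed on first use
  return r;
}

struct Registrar {
  Registrar(const std::string& name, Factory f) {
    Registry()[name] = std::move(f);
  }
};

#define REGISTER_MOCK_PLUGIN(name, fn) static Registrar registrar_##fn(name, fn)

void* DeserializeSplit(const void* /*serial_data*/, size_t /*serial_length*/) {
  std::cout << "split_plugin deserialized\n";
  return nullptr;  // a real creator would return a new SplitPluginDynamic
}
REGISTER_MOCK_PLUGIN("split_plugin", DeserializeSplit);

int main() {
  Registry()["split_plugin"](nullptr, 0);  // what engine load does, by name
  return 0;
}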
@ -24,19 +24,22 @@ namespace tensorrt {
namespace plugin {
#if IS_TRT_VERSION_GE(6000)
StackPluginDynamic::StackPluginDynamic(int axis, int num_stack)
: axis_(axis), num_stack_(num_stack) {}
StackPluginDynamic::StackPluginDynamic(int axis, int num_stack, bool with_fp16)
: axis_(axis), num_stack_(num_stack) {
with_fp16_ = with_fp16;
}
StackPluginDynamic::StackPluginDynamic(void const* serial_data,
size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &num_stack_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
}
StackPluginDynamic::~StackPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt* StackPluginDynamic::clone() const {
return new StackPluginDynamic(axis_, num_stack_);
return new StackPluginDynamic(axis_, num_stack_, with_fp16_);
}
const char* StackPluginDynamic::getPluginType() const { return "stack_plugin"; }
@ -49,12 +52,14 @@ size_t StackPluginDynamic::getSerializationSize() const {
size_t serialize_size = 0;
serialize_size += SerializedSize(axis_);
serialize_size += SerializedSize(num_stack_);
serialize_size += SerializedSize(with_fp16_);
return serialize_size;
}
void StackPluginDynamic::serialize(void* buffer) const {
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, num_stack_);
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions(
@ -99,14 +104,14 @@ bool StackPluginDynamic::supportsFormatCombination(
const nvinfer1::PluginTensorDesc& in = in_out[pos];
if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#else
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#endif
if (with_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
}
const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1];
// output
@ -170,20 +175,17 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
auto infer_type = input_desc[0].type;
if (infer_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. Stack-->fp32";
float* output = static_cast<float*>(outputs[0]);
StackKernel<float><<<num_blocks, num_threads, 0, stream>>>(
reinterpret_cast<const float* const*>(workspace), output, num_stacks,
base_unit);
} else if (infer_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
VLOG(1) << "TRT Plugin DataType selected. Stack-->fp16";
__half* output = static_cast<__half*>(outputs[0]);
StackKernel<__half><<<num_blocks, num_threads, 0, stream>>>(
reinterpret_cast<const __half* const*>(workspace), output, num_stacks,
base_unit);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
} else {
PADDLE_THROW(
platform::errors::Fatal("The Stack TRT Plugin's input type only "
@ -209,6 +211,7 @@ nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) {
int axis = -1;
int num_stack = -1;
bool with_fp16 = false;
for (int i = 0; i < fc->nbFields; ++i) {
const std::string name(fc->fields[i].name);
@ -216,13 +219,15 @@ nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin(
axis = static_cast<const int*>(fc->fields[i].data)[0];
} else if (name == "num_stack") {
num_stack = static_cast<const int*>(fc->fields[i].data)[0];
} else if (name == "with_fp16") {
with_fp16 = static_cast<const bool*>(fc->fields[i].data)[0];
} else {
PADDLE_THROW(platform::errors::Fatal("Meet an unknown plugin field '" +
name +
"' when creating stack op plugin."));
}
}
return new StackPluginDynamic(axis, num_stack);
return new StackPluginDynamic(axis, num_stack, with_fp16);
}
nvinfer1::IPluginV2* StackPluginDynamicCreator::deserializePlugin(

@ -28,7 +28,7 @@ namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class StackPluginDynamic : public DynamicPluginTensorRT {
public:
explicit StackPluginDynamic(int axis, int num_stack);
explicit StackPluginDynamic(int axis, int num_stack, bool with_fp16);
StackPluginDynamic(void const* serial_data, size_t serial_length);
~StackPluginDynamic();
nvinfer1::IPluginV2DynamicExt* clone() const override;

@ -44,12 +44,12 @@ nvinfer1::Dims SwishPlugin::getOutputDimensions(int index,
template <typename T>
__device__ T math_exp(T a);
#ifdef SUPPORTS_CUDA_FP16
template <>
__device__ half math_exp<half>(half a) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
return hexp(a);
}
#endif
}
template <>
__device__ float math_exp<float>(float a) {
@ -71,6 +71,19 @@ __global__ void swish_kernel(int num, const T *input, T *output, T beta) {
}
}
template <>
__global__ void swish_kernel<half>(int num, const half *input, half *output,
half beta) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
output[index] =
__ldg(input + index) /
(static_cast<half>(1.0) + math_exp<half>(-beta * __ldg(input + index)));
#endif
}
}
int SwishPlugin::enqueue(int batch_size, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
// input dims is CHW.
@ -92,14 +105,18 @@ int SwishPlugin::enqueue(int batch_size, const void *const *inputs,
#if IS_TRT_VERSION_GE(6000)
int SwishPluginDynamic::initialize() {
setPluginNamespace("swish");
getPluginNamespace();
return 0;
}
size_t SwishPluginDynamic::getSerializationSize() const { return 0; }
size_t SwishPluginDynamic::getSerializationSize() const {
return SerializedSize(beta_) + SerializedSize(with_fp16_);
}
void SwishPluginDynamic::serialize(void *buffer) const {}
void SwishPluginDynamic::serialize(void *buffer) const {
SerializeValue(&buffer, beta_);
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs SwishPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
@ -123,14 +140,14 @@ bool SwishPluginDynamic::supportsFormatCombination(
const nvinfer1::PluginTensorDesc &in = in_out[pos];
if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#else
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
#endif
if (with_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
}
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
// output
@ -157,20 +174,17 @@ int SwishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. Swish-->fp32";
const float *input = static_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
swish_kernel<float><<<blocks, threads, 0, stream>>>(num, input, output,
beta_);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
VLOG(1) << "TRT Plugin DataType selected. Swish-->fp16";
const half *input = static_cast<const half *>(inputs[0]);
half *output = static_cast<half *>(outputs[0]);
swish_kernel<half><<<blocks, threads, 0, stream>>>(
num, input, output, static_cast<half>(beta_));
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"The Swish TRT Plugin's input type should be float or half."));

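For reference, both swish kernels above evaluate swish(x) = x * sigmoid(beta * x), written as x / (1 + exp(-beta * x)); the half specialization wraps its body in CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) because hexp() requires sm_60 or newer. A host-side sketch:

#include <cmath>
#include <cstdio>

float swish(float x, float beta) { return x / (1.0f + std::exp(-beta * x)); }

int main() {
  const float beta = 1.0f;  // taken from the op's "beta" attribute
  for (float x : {-2.0f, 0.0f, 2.0f})
    std::printf("swish(%.1f) = %.6f\n", x, swish(x, beta));
  return 0;
}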
@ -32,7 +32,8 @@ class SwishPlugin : public PluginTensorRT {
protected:
size_t getSerializationSize() override {
return getBaseSerializationSize() + SerializedSize(beta_);
return SerializedSize(getPluginType()) + getBaseSerializationSize() +
SerializedSize(beta_);
}
// TRT will call this func when we need to serialize the configuration of
@ -45,7 +46,9 @@ class SwishPlugin : public PluginTensorRT {
}
public:
explicit SwishPlugin(const float beta) : beta_(beta) {}
explicit SwishPlugin(const float beta, const bool with_fp16) : beta_(beta) {
with_fp16_ = with_fp16;
}
// It was used for tensorrt deserialization.
// It should not be called by users.
@ -56,7 +59,9 @@ class SwishPlugin : public PluginTensorRT {
~SwishPlugin() {}
int initialize() override;
SwishPlugin* clone() const override { return new SwishPlugin(beta_); }
SwishPlugin* clone() const override {
return new SwishPlugin(beta_, with_fp16_);
}
const char* getPluginType() const override { return "swish_plugin"; }
int getNbOutputs() const override { return 1; }
@ -69,10 +74,16 @@ class SwishPlugin : public PluginTensorRT {
#if IS_TRT_VERSION_GE(6000)
class SwishPluginDynamic : public DynamicPluginTensorRT {
public:
explicit SwishPluginDynamic(const float beta) : beta_(beta) {}
SwishPluginDynamic(void const* serialData, size_t serialLength) {}
explicit SwishPluginDynamic(const float beta, const bool with_fp16)
: beta_(beta) {
with_fp16_ = with_fp16;
}
SwishPluginDynamic(void const* serialData, size_t serialLength) {
DeserializeValue(&serialData, &serialLength, &beta_);
DeserializeValue(&serialData, &serialLength, &with_fp16_);
}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new SwishPluginDynamic(beta_);
return new SwishPluginDynamic(beta_, with_fp16_);
}
const char* getPluginType() const override { return "swish_plugin"; }
@ -115,6 +126,46 @@ class SwishPluginDynamic : public DynamicPluginTensorRT {
private:
float beta_;
};
class SwishPluginV2Creator : public nvinfer1::IPluginCreator {
public:
SwishPluginV2Creator() {}
const char* getPluginName() const override { return "swish_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new SwishPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(SwishPluginV2Creator);
#endif
} // namespace plugin

Some files were not shown because too many files have changed in this diff.
