Merge pull request #15729 from NHZlX/add_static_model_load_for_trt

Four points for enhancing Paddle-TRT
align_pyramid
Zhaolong Xing 6 years ago committed by GitHub
commit 3d63aa0a11
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr

@ -14,6 +14,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
@ -24,6 +25,10 @@ namespace ir {
static const char kParamScopeAttr[] = "__param_scope__";
static const char kFuseStatisAttr[] = "__fuse_statis__";
// When TensorRT or another third-party library is used, the parameters are
// managed by that library rather than by fluid, so we record them here to
// avoid allocating them a second time.
static const char kRepetitiveParamAttr[] = "__repetitive_param__";
enum FuseOptions {
DO_NOT_FUSE, // fusing will not be done

@ -23,8 +23,12 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
@ -133,6 +137,8 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
AnalysisConfig::Precision);
DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
bool);
// Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);

@ -17,10 +17,12 @@ limitations under the License. */
#include <sys/stat.h>
#include <cstdio>
#include <fstream>
#include <memory>
#include <set>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
return "";
}
// Returns the location of the cached serialized TensorRT engine identified by
// `engine_key` inside the directory `model_root`, i.e.
// "<model_root>/trt_serialized_<engine_key>".
static std::string GetTrtEngineSerializedPath(const std::string &model_root,
                                              const std::string &engine_key) {
  std::string serialized_path(model_root);
  serialized_path += "/trt_serialized_";
  serialized_path += engine_key;
  return serialized_path;
}
// Loads the serialized TensorRT engine cached for `engine_key` under
// `model_opt_cache_dir`.
//
// Returns the raw bytes of the cache file, or an empty string when the file
// does not exist.
static std::string GetTrtEngineSerializedData(
    const std::string &model_opt_cache_dir, const std::string &engine_key) {
  std::string trt_serialized_path =
      GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key);
  if (!FileExists(trt_serialized_path)) {
    return "";
  }
  VLOG(3) << "Trt serialized file: " << trt_serialized_path
          << " is found here";
  // The serialized engine is an opaque byte blob, so the stream is opened in
  // binary mode to keep platforms with text-mode translation from altering it.
  std::ifstream infile(trt_serialized_path, std::ios::in | std::ios::binary);
  std::stringstream buffer;
  buffer << infile.rdbuf();
  return buffer.str();
}
// Writes the serialized TensorRT engine bytes `engine_serialized_data` to the
// file `trt_serialized_path`, replacing any previous content.
//
// The payload is an opaque byte blob, so the file is opened in binary mode and
// written with an unformatted write; this keeps the bytes on disk identical to
// the bytes in memory on every platform.
static void SaveTrtEngineSerializedDataToFile(
    const std::string &trt_serialized_path,
    const std::string &engine_serialized_data) {
  std::ofstream outfile(trt_serialized_path, std::ios::out | std::ios::binary);
  outfile.write(engine_serialized_data.data(),
                static_cast<std::streamsize>(engine_serialized_data.size()));
  outfile.close();
}
} // namespace analysis
} // namespace inference
} // namespace paddle

@ -81,6 +81,9 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set(
"model_opt_cache_dir",
new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine",
new bool(argument->tensorrt_use_static_engine()));
}
pre_pass = pass_name;

@ -22,7 +22,10 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

@ -13,7 +13,12 @@
// limitations under the License.
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
std::unique_ptr<framework::ir::Graph> graph) const override;
private:
void CreateTensorRTOp(framework::ir::Node *x,
framework::ir::Graph *graph) const;
void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const;
void CleanIntermediateOutputs(framework::ir::Node *node);
};

@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// The parameters are on the cpu, therefore, synchronization is not necessary.
if (!argument->use_gpu()) return;
auto &graph = argument->main_graph();
std::vector<std::string> repetitive_params;
if (graph.Has(framework::ir::kRepetitiveParamAttr))
repetitive_params = graph.Get<std::vector<std::string>>(
framework::ir::kRepetitiveParamAttr);
LOG(INFO) << "Sync params from CPU to GPU";
PADDLE_ENFORCE(argument->gpu_device_id_valid());
@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// Because there exists the case that new parameter variables are not added to
// the program in the analysis pass.
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
continue;
}
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE(var != nullptr);
if (var->IsType<framework::LoDTensor>() ||

@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h"

@ -103,6 +103,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(tensorrt_max_batchsize_);
CP_MEMBER(tensorrt_min_subgraph_size_);
CP_MEMBER(tensorrt_precision_mode_);
CP_MEMBER(trt_use_static_engine_);
// MKLDNN related.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() {
void AnalysisConfig::EnableTensorRtEngine(
int workspace_size, int max_batch_size, int min_subgraph_size,
AnalysisConfig::Precision precision_mode) {
AnalysisConfig::Precision precision_mode, bool use_static) {
#ifdef PADDLE_WITH_CUDA
if (!use_gpu()) {
LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine(
tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size;
tensorrt_precision_mode_ = precision_mode;
trt_use_static_engine_ = use_static;
Update();
#else

@ -365,6 +365,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
}
if (config_.use_mkldnn_) {
@ -438,12 +439,14 @@ void AnalysisPredictor::PrepareFeedFetch() {
}
feeds_[idx] = op;
feed_names_[op->Output("Out")[0]] = idx;
idx2feeds_[idx] = op->Output("Out")[0];
} else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col"));
if (fetches_.size() <= static_cast<size_t>(idx)) {
fetches_.resize(idx + 1);
}
fetches_[idx] = op;
idx2fetches_[idx] = op->Input("X")[0];
}
}
}
@ -456,6 +459,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
var->GetMutable<framework::FeedFetchList>();
}
// Returns the feed variable names recorded in idx2feeds_, in the iteration
// order of that map (ascending key).
std::vector<std::string> AnalysisPredictor::GetInputNames() {
  std::vector<std::string> names;
  names.reserve(idx2feeds_.size());
  for (const auto &idx_and_name : idx2feeds_) {
    names.emplace_back(idx_and_name.second);
  }
  return names;
}
// Returns the fetch variable names recorded in idx2fetches_, in the iteration
// order of that map (ascending key).
std::vector<std::string> AnalysisPredictor::GetOutputNames() {
  std::vector<std::string> names;
  names.reserve(idx2fetches_.size());
  for (const auto &idx_and_name : idx2fetches_) {
    names.emplace_back(idx_and_name.second);
  }
  return names;
}
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
@ -463,6 +482,13 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = true;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
}
@ -473,6 +499,12 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = false;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
}

@ -15,12 +15,14 @@
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
#ifdef PADDLE_WITH_TESTING
@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor {
std::vector<PaddleTensor> *output_data,
int batch_size = -1) override;
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string &name) override;
std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor {
std::shared_ptr<framework::ProgramDesc> inference_program_;
std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_;
// Sorted according to the idx.
std::map<size_t, std::string> idx2feeds_;
std::vector<framework::OpDesc *> fetches_;
std::map<size_t, std::string> idx2fetches_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, wrong results and memory leak, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;

@ -15,6 +15,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
return res;
}
// Copies `tensor->numel()` elements of type T from the host buffer `data` into
// the underlying tensor.
//
// When place_ is PaddlePlace::kCPU the tensor is (re)allocated on the CPU and
// filled with memcpy; otherwise it is allocated on CUDA device `device_` and
// the copy is issued on that device context's stream.
//
// `data` must point to at least numel() readable elements.
template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) {
  EAGER_GET_TENSOR;
  // NOTE(review): a `>= 0` test on numel() presumably never fails, so it does
  // not really detect a missing Reshape(); kept as-is because tightening it
  // would also reject empty tensors -- confirm the intended contract.
  PADDLE_ENFORCE_GE(
      tensor->numel(), 0,
      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape) "
      "function before copy data from cpu.");
  // Total payload size in bytes (not an element count).
  const size_t num_bytes = tensor->numel() * sizeof(T);

  if (place_ == PaddlePlace::kCPU) {
    auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
    std::memcpy(static_cast<void *>(t_data), data, num_bytes);
  } else {
#ifdef PADDLE_WITH_CUDA
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    platform::CUDAPlace gpu_place(device_);
    auto *t_data = tensor->mutable_data<T>(gpu_place);
    auto *dev_ctx =
        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
    // NOTE(review): the copy is enqueued on the context stream and is
    // presumably asynchronous; `data` should stay valid until it completes.
    memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
                 data, num_bytes, dev_ctx->stream());
#else
    PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
  }
}
// Copies every element of the underlying tensor into the host buffer `data`.
//
// `data` must have room for at least numel() elements of type T. For a tensor
// that lives on a CUDA device the copy is issued on that device context's
// stream and this function waits for the stream before returning, so `data`
// is fully populated when the call returns.
template <typename T>
void ZeroCopyTensor::copy_to_cpu(T *data) {
  EAGER_GET_TENSOR;
  const auto ele_num = tensor->numel();
  auto *t_data = tensor->data<T>();
  const auto t_place = tensor->place();

  if (platform::is_cpu_place(t_place)) {
    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
  } else {
#ifdef PADDLE_WITH_CUDA
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto gpu_place = boost::get<platform::CUDAPlace>(t_place);
    auto *dev_ctx =
        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
                 t_data, ele_num * sizeof(T), dev_ctx->stream());
    // A stream-ordered copy may still be in flight when memory::Copy returns
    // (e.g. into pinned host memory); block until the stream has drained so
    // the caller never reads a partially written buffer.
    dev_ctx->Wait();
#else
    PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
  }
}
template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
int *size) const;
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const {
return tensor;
}
std::vector<int64_t> ZeroCopyTensor::shape() const {
std::vector<int> ZeroCopyTensor::shape() const {
EAGER_GET_TENSOR;
PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
return framework::vectorize(tensor->dims());
return framework::vectorize2int(tensor->dims());
}
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {

@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const { return nullptr; }
std::vector<int64_t> ZeroCopyTensor::shape() const { return {}; }
std::vector<int> ZeroCopyTensor::shape() const { return {}; }
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}

@ -50,6 +50,11 @@ class Timer {
}
};
// Hands out consecutive integers starting at 0; each call returns the current
// value and advances the counter by one.
// NOTE(review): the function has internal linkage, so if this lives in a
// header every translation unit gets its own counter -- confirm that is
// acceptable for callers. The counter is a plain int with no synchronization.
static int GetUniqueId() {
  static int next_id = 0;
  const int issued = next_id;
  ++next_id;
  return issued;
}
static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) {
pieces->clear();

@ -135,7 +135,8 @@ struct AnalysisConfig {
*/
void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3,
Precision precision = Precision::kFloat32);
Precision precision = Precision::kFloat32,
bool use_static = true);
/** A boolean state telling whether the TensorRT engine is used.
*/
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
@ -233,6 +234,7 @@ struct AnalysisConfig {
// subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3};
// Both members are copied by the copy constructor even when
// EnableTensorRtEngine() was never called, so they carry in-class defaults
// (mirroring the default arguments of EnableTensorRtEngine) instead of being
// left indeterminate.
Precision tensorrt_precision_mode_{Precision::kFloat32};
bool trt_use_static_engine_{true};
// memory reuse related.
bool enable_memory_optim_{false};

@ -160,11 +160,21 @@ class ZeroCopyTensor {
template <typename T>
T* data(PaddlePlace* place, int* size) const;
std::vector<int64_t> shape() const;
template <typename T>
void copy_from_cpu(const T* data);
template <typename T>
void copy_to_cpu(T* data);
std::vector<int> shape() const;
void SetLoD(const std::vector<std::vector<size_t>>& x);
std::vector<std::vector<size_t>> lod() const;
const std::string& name() const { return name_; }
void SetPlace(PaddlePlace place, int device = -1) {
place_ = place;
device_ = device;
}
protected:
explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
@ -179,6 +189,8 @@ class ZeroCopyTensor {
// The corresponding tensor pointer inside Paddle workspace is cached for
// performance.
mutable void* tensor_{nullptr};
PaddlePlace place_;
int device_;
};
/** A simple Inference API for Paddle.
@ -200,6 +212,14 @@ class PaddlePredictor {
std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0;
/** \brief Get input names of the model
*/
virtual std::vector<std::string> GetInputNames() { return {}; }
/** \brief Get output names of the model
*/
virtual std::vector<std::string> GetOutputNames() { return {}; }
/** \brief Get a mutable tensor directly.
*
* NOTE Only works in AnalysisPredictor.

@ -49,11 +49,6 @@ class EngineBase {
// Execute the engine, that will run the inference network.
virtual void Execute(int batch_size) = 0;
// Return the IO buffer that allocated in engine. One can read/write directly
// on the buffer. If the buffer's buffer is nullptr, one can also allocate
// memory and maintain it outside the engine.
virtual Buffer& buffer(const std::string& name) = 0;
virtual ~EngineBase() {}
}; // class EngineBase

@ -18,21 +18,6 @@ namespace paddle {
namespace inference {
namespace tensorrt {
// Decides whether the conv output must be declared as an engine output, based
// on the per-input counter engine->itensor_quote_num.
//
// Returns true when the counter for `input_name` is already positive.
// Otherwise returns false, after incrementing the counter if the convolution
// is a 1x1 filter with stride 1 and zero padding.
bool to_skip_merging_optimize(TensorRTEngine* engine,
                              const std::vector<int>& filters,
                              const std::vector<int>& strides,
                              const std::vector<int>& paddings,
                              std::string input_name) {
  // operator[] creates the entry on first use, exactly as a direct lookup does.
  auto& quote_count = engine->itensor_quote_num[input_name];
  if (quote_count > 0) {
    return true;
  }

  const bool is_plain_1x1_conv = filters[0] == 1 && filters[1] == 1 &&
                                 strides[0] == 1 && strides[1] == 1 &&
                                 paddings[0] == 0 && paddings[1] == 0;
  if (is_plain_1x1_conv) {
    quote_count += 1;
  }
  return false;
}
template <typename RegistFunc, typename SetDilationFunc>
void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode,
@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
auto* weight_data = weight_tensor->mutable_data<float>(cpu_place);
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
const int n_output = weight_tensor->dims()[0];
@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
layer->getOutput(0)->setName(output_name.c_str());
engine->SetITensor(output_name, layer->getOutput(0));
if (test_mode ||
to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
op_desc.Input("Input").front())) {
if (test_mode) {
engine->DeclareOutput(output_name);
}
}

@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter {
if (CheckDims(dims_x, dims_y)) {
// The two input tensor should have the same dims
VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
*const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
"ElementWisePluginLayer";
plugin::ElementWisePlugin* plugin =
new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
plugin->AddInput(X);
plugin->AddInput(Y);
nvinfer1::IPluginLayer* layer = engine_->AddPlugin(

@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter {
Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)};
static_cast<size_t>(Y_t->numel())};
TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
static_cast<void*>(tmp->data<float>()),
Y_t->memory_size() / sizeof(float));
static_cast<size_t>(Y_t->numel()));
weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
tmp_weight.dims = weight.dims;

@ -16,9 +16,12 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"
@ -26,6 +29,37 @@ namespace paddle {
namespace inference {
namespace tensorrt {
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace { // NOLINT
// Maps a fluid variable data type to the TensorRT data type.
//
// FP32 maps to kFLOAT and INT32 maps to kINT32. Every other fluid type also
// maps to kINT32; this function never throws.
// NOTE(review): the silent kINT32 fallback for unsupported types looks
// unintended (an unreachable "unkown type" throw used to follow the switch);
// confirm whether unsupported types should be rejected instead.
TRT_DT FluidDataType2TRT(FluidDT type) {
  switch (type) {
    case FluidDT::VarType_Type_FP32:
      return TRT_DT::kFLOAT;
    case FluidDT::VarType_Type_INT32:
      return TRT_DT::kINT32;
    default:
      break;
  }
  // Fallback for every type not listed above.
  return TRT_DT::kINT32;
}
// Converts a fluid shape into TensorRT CHW dims.
//
// Only 2-D and 4-D shapes are accepted. The leading dimension (shape[0]) is
// not carried over: a 4-D shape becomes (shape[1], shape[2], shape[3]) and a
// 2-D shape becomes (shape[1], 1, 1). Any other rank fails a PADDLE_ENFORCE.
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
  PADDLE_ENFORCE_GT(shape.size(), 1UL,
                    "TensorRT's tensor input requires at least 2 dimensions");
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
                    "TensorRT's tensor input requires at most 4 dimensions");
  // A 3-D shape passes both bounds above, so report it explicitly.
  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL,
                 "TensorRT's tensor input requires exactly 2 or 4 dimensions");
  if (shape.size() == 4UL) {
    return nvinfer1::DimsCHW(static_cast<int>(shape[1]),
                             static_cast<int>(shape[2]),
                             static_cast<int>(shape[3]));
  }
  return nvinfer1::DimsCHW(static_cast<int>(shape[1]), 1, 1);
}
} // namespace // NOLINT
/*
* Convert Op from Fluid to TensorRT Engine.
*/
@ -110,6 +144,34 @@ class OpConverter {
}
}
// The scope here should be inited with the parameter vars.
// Builds a whole TensorRT network inside `engine` from `block_desc`.
//
// Steps: initialise the network, declare every entry of `inputs` that is not
// listed in `parameters` as an engine input (it must be a LoDTensor variable
// of the block), convert the block's ops via ConvertBlock, declare every
// entry of `outputs` as an engine output, then freeze the network.
// `scope` is forwarded to ConvertBlock and should already hold the parameter
// variables.
void ConvertBlockToTRTEngine(
    framework::BlockDesc* block_desc, const framework::Scope& scope,
    const std::vector<std::string>& inputs,
    const std::unordered_set<std::string>& parameters,
    const std::vector<std::string>& outputs, TensorRTEngine* engine) {
  engine->InitNetwork();

  for (const auto& input_name : inputs) {
    // Parameters are handled during op conversion, not declared as inputs.
    const bool is_parameter = parameters.count(input_name) != 0;
    if (is_parameter) continue;

    auto* var_desc = block_desc->FindVar(input_name);
    PADDLE_ENFORCE(var_desc, "no variable called %s", input_name);
    PADDLE_ENFORCE_EQ(var_desc->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                      "TensorRT engine only takes LoDTensor as input");

    auto input_shape = var_desc->GetShape();
    engine->DeclareInput(
        input_name,
        FluidDataType2TRT(
            var_desc->Proto()->type().lod_tensor().tensor().data_type()),
        Vec2TRT_Dims(input_shape));
  }

  framework::proto::BlockDesc* proto_block = block_desc->Proto();
  ConvertBlock(*proto_block, parameters, scope, engine);

  for (const auto& output_name : outputs) {
    engine->DeclareOutput(output_name);
  }
  engine->FreezeNetwork();
}
void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
virtual ~OpConverter() {}

@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter {
PADDLE_ENFORCE_NOT_NULL(alpha_var);
auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();
platform::CUDAPlace place;
std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
platform::CPUPlace cpu_place;
std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
new framework::LoDTensor());
alpha_tensor_device->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
alpha_tensor_temp->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);
// Transform alpha to TensorRTEngine::Weight
TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data),
alpha_tensor_device->numel());
plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
plugin::PReluPlugin* plugin =
new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin);
// keep alpha tensor to avoid release it's memory
engine_->weight_map[op_desc.Input("Alpha")[0]] =
std::move(alpha_tensor_device);
std::move(alpha_tensor_temp);
std::string layer_name = "prelu (Output: ";
auto output_name = op_desc.Output("Out")[0];

@ -19,7 +19,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
@ -79,7 +81,8 @@ class TRTConvertValidation {
if_add_batch_(if_add_batch),
max_batch_size_(max_batch_size) {
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_));
engine_.reset(
new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0));
engine_->InitNetwork();
}
@ -114,13 +117,12 @@ class TRTConvertValidation {
}
void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
platform::CUDADeviceContext ctx(place_);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
RandomizeTensor(x_tensor, place_, ctx);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
@ -146,19 +148,6 @@ class TRTConvertValidation {
// Declare outputs.
op_desc_.reset(new framework::OpDesc(desc, nullptr));
// Set Inputs.
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
auto* var = scope_.FindVar(input);
PADDLE_ENFORCE(var);
auto tensor = var->GetMutable<framework::LoDTensor>();
engine_->SetInputFromGPU(
input, static_cast<void*>(tensor->data<void>()),
sizeof(float) *
analysis::AccuDims(tensor->dims(), tensor->dims().size()));
}
}
// We use the set 'neglected_output' here, because some Ops like batch norm,
@ -168,43 +157,71 @@ class TRTConvertValidation {
std::unordered_set<std::string> neglected_output = {}) {
// Execute Fluid Op
PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
op_->Run(scope_, place);
// Execute TRT.
engine_->Execute(batch_size);
cudaStreamSynchronize(engine_->stream());
platform::CUDADeviceContext ctx(place_);
op_->Run(scope_, place_);
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
const size_t output_space_size = 3000;
std::vector<std::string> input_output_names;
// Note: we need filter the parameter
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
input_output_names.push_back(input);
}
// Collect the fluid outputs.
std::vector<std::vector<float>> fluid_outs;
for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue;
input_output_names.push_back(output);
std::vector<float> fluid_out;
std::vector<float> trt_out(output_space_size);
engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
cudaStreamSynchronize(engine_->stream());
auto* var = scope_.FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>();
auto* tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &fluid_out);
fluid_outs.push_back(fluid_out);
}
// Bind input and output for TRT.
const int num_bindings = input_output_names.size();
std::vector<void*> buffers(num_bindings);
for (const std::string& name : input_output_names) {
auto* var = scope_.FindVar(name);
auto* tensor = var->GetMutable<framework::LoDTensor>();
const int bind_index = engine_->engine()->getBindingIndex(name.c_str());
buffers[bind_index] =
static_cast<void*>(tensor->mutable_data<float>(place_));
}
// Execute TRT.
engine_->Execute(batch_size, &buffers, stream_);
size_t fluid_out_size = fluid_out.size();
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
int index = 0;
for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue;
std::vector<float> trt_out;
auto* var = scope_.FindVar(output);
auto* tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &trt_out);
size_t fluid_out_size = fluid_outs[index].size();
if (if_add_batch_ == true) {
fluid_out_size =
batch_size * (framework::product(tensor->dims()) / max_batch_size_);
}
// Compare two output
ASSERT_FALSE(fluid_out.empty());
for (size_t i = 0; i < fluid_out_size; i++) {
// Loose the threshold for CI in different machine model.
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5);
}
index += 1;
}
}
framework::Scope& scope() { return scope_; }
private:
platform::CUDAPlace place_;
std::unique_ptr<TensorRTEngine> engine_;
cudaStream_t stream_;
std::unique_ptr<framework::OperatorBase> op_;

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save