You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
982 lines
34 KiB
982 lines
34 KiB
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "paddle/fluid/inference/api/analysis_predictor.h"
|
|
#include <glog/logging.h>
|
|
#include <algorithm>
|
|
#include <fstream>
|
|
#include <memory>
|
|
#include <set>
|
|
#include <string>
|
|
#include <utility>
|
|
#include <vector>
|
|
#include "paddle/fluid/framework/feed_fetch_method.h"
|
|
#include "paddle/fluid/framework/feed_fetch_type.h"
|
|
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
|
|
#include "paddle/fluid/framework/ir/pass.h"
|
|
#include "paddle/fluid/framework/naive_executor.h"
|
|
#include "paddle/fluid/framework/scope.h"
|
|
#include "paddle/fluid/framework/var_type_traits.h"
|
|
#include "paddle/fluid/framework/version.h"
|
|
#include "paddle/fluid/inference/analysis/helper.h"
|
|
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
|
|
#include "paddle/fluid/inference/api/helper.h"
|
|
#include "paddle/fluid/inference/api/paddle_inference_api.h"
|
|
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
|
|
#include "paddle/fluid/inference/utils/singleton.h"
|
|
#include "paddle/fluid/memory/memcpy.h"
|
|
#include "paddle/fluid/platform/cpu_helper.h"
|
|
#ifdef PADDLE_WITH_MKLML
|
|
#include "paddle/fluid/platform/dynload/mklml.h"
|
|
#endif
|
|
#include "paddle/fluid/platform/gpu_info.h"
|
|
#include "paddle/fluid/platform/place.h"
|
|
#include "paddle/fluid/platform/profiler.h"
|
|
|
|
#ifdef PADDLE_WITH_MKLDNN
|
|
#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
|
|
#endif
|
|
|
|
#if PADDLE_WITH_TENSORRT
|
|
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
|
|
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
|
|
#endif
|
|
|
|
namespace paddle {
|
|
|
|
using inference::Singleton;
|
|
#if PADDLE_WITH_TENSORRT
|
|
using inference::tensorrt::TRTInt8Calibrator;
|
|
using inference::tensorrt::TRTCalibratorEngine;
|
|
using inference::tensorrt::TRTCalibratorEngineManager;
|
|
#endif
|
|
|
|
namespace {
|
|
// Tells whether `var` should be treated as a persistable variable here:
// it must be flagged persistable and its type must be none of
// FEED_MINIBATCH, FETCH_LIST or RAW.
bool IsPersistable(const framework::VarDesc *var) {
  if (!var->Persistable()) {
    return false;
  }
  const auto var_type = var->GetType();
  return var_type != framework::proto::VarType::FEED_MINIBATCH &&
         var_type != framework::proto::VarType::FETCH_LIST &&
         var_type != framework::proto::VarType::RAW;
}
|
|
} // namespace
|
|
|
|
bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
|
|
const platform::Place &place) {
|
|
framework::DDim ddim = framework::make_ddim(pt.shape);
|
|
void *input_ptr;
|
|
if (pt.dtype == PaddleDType::INT64) {
|
|
input_ptr = t->mutable_data<int64_t>(ddim, place);
|
|
} else if (pt.dtype == PaddleDType::FLOAT32) {
|
|
input_ptr = t->mutable_data<float>(ddim, place);
|
|
} else if (pt.dtype == PaddleDType::INT32) {
|
|
input_ptr = t->mutable_data<int32_t>(ddim, place);
|
|
} else {
|
|
LOG(ERROR) << "unsupported feed type " << pt.dtype;
|
|
return false;
|
|
}
|
|
|
|
PADDLE_ENFORCE_NOT_NULL(
|
|
input_ptr,
|
|
paddle::platform::errors::Fatal(
|
|
"Cannot convert to LoDTensor because LoDTensor creation failed."));
|
|
PADDLE_ENFORCE_NOT_NULL(
|
|
pt.data.data(),
|
|
paddle::platform::errors::InvalidArgument(
|
|
"The data contained in the input PaddleTensor is illegal."));
|
|
|
|
if (platform::is_cpu_place(place)) {
|
|
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
|
|
std::memcpy(static_cast<void *>(input_ptr), pt.data.data(),
|
|
pt.data.length());
|
|
} else {
|
|
#ifdef PADDLE_WITH_CUDA
|
|
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
|
|
auto *dev_ctx =
|
|
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
|
|
auto dst_gpu_place = boost::get<platform::CUDAPlace>(place);
|
|
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
|
|
platform::CPUPlace(), pt.data.data(), pt.data.length(),
|
|
dev_ctx->stream());
|
|
#else
|
|
PADDLE_THROW(paddle::platform::errors::Fatal(
|
|
"Not compile with CUDA, should not reach here."));
|
|
#endif
|
|
}
|
|
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
|
|
framework::LoD lod;
|
|
for (auto &level : pt.lod) {
|
|
lod.emplace_back(level);
|
|
}
|
|
t->set_lod(lod);
|
|
return true;
|
|
}
|
|
|
|
bool AnalysisPredictor::Init(
|
|
const std::shared_ptr<framework::Scope> &parent_scope,
|
|
const std::shared_ptr<framework::ProgramDesc> &program) {
|
|
VLOG(3) << "Predictor::init()";
|
|
if (config_.with_profile_) {
|
|
LOG(WARNING) << "Profiler is activated, which might affect the performance";
|
|
auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll
|
|
: platform::ProfilerState::kCPU;
|
|
platform::EnableProfiler(tracking_device);
|
|
} else {
|
|
LOG(INFO) << "Profiler is deactivated, and no profiling report will be "
|
|
"generated.";
|
|
}
|
|
|
|
// no matter with or without MKLDNN
|
|
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
|
|
|
|
if (!PrepareScope(parent_scope)) {
|
|
return false;
|
|
}
|
|
if (!CreateExecutor()) {
|
|
return false;
|
|
}
|
|
if (!PrepareProgram(program)) {
|
|
return false;
|
|
}
|
|
|
|
// Prepare executor, create local variables.
|
|
if (!PrepareExecutor()) {
|
|
return true;
|
|
}
|
|
|
|
// Get the feed_target_names and fetch_target_names
|
|
PrepareFeedFetch();
|
|
|
|
return true;
|
|
}
|
|
|
|
bool AnalysisPredictor::PrepareScope(
|
|
const std::shared_ptr<framework::Scope> &parent_scope) {
|
|
if (parent_scope) {
|
|
PADDLE_ENFORCE_NOT_NULL(
|
|
parent_scope,
|
|
"Both program and parent_scope should be set in Clone mode.");
|
|
scope_ = parent_scope;
|
|
status_is_cloned_ = true;
|
|
} else {
|
|
paddle::framework::InitDevices(false);
|
|
scope_.reset(new paddle::framework::Scope());
|
|
status_is_cloned_ = false;
|
|
}
|
|
sub_scope_ = &scope_->NewScope();
|
|
return true;
|
|
}
|
|
bool AnalysisPredictor::PrepareProgram(
|
|
const std::shared_ptr<framework::ProgramDesc> &program) {
|
|
if (!program) {
|
|
if (!LoadProgramDesc()) return false;
|
|
// If not cloned, the parameters should be loaded.
|
|
// If config_.ir_optim() is True, parameters is loaded in
|
|
// OptimizeInferenceProgram(), but other persistable variables
|
|
// (like RAW type var) are not created in scope.
|
|
// If config_.ir_optim() is False, parameters is loaded in LoadParameters(),
|
|
// still need to create other persistable variables.
|
|
// So in both case, create persistable variables at first.
|
|
if (!CheckOperatorCompatible()) {
|
|
LOG(WARNING) << "WARNING: Results may be DIFF! "
|
|
"Please use the corresponding version of the model and "
|
|
"prediction library, and do not use the develop branch.";
|
|
}
|
|
executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
|
|
|
|
// if enable_ir_optim_ is false,
|
|
// the analysis pass(op fuse, graph analysis, trt subgraph, mkldnn etc) will
|
|
// not be executed.
|
|
OptimizeInferenceProgram();
|
|
} else {
|
|
// If the program is passed from external, no need to optimize it, this
|
|
// logic is used in the clone scenario.
|
|
inference_program_ = program;
|
|
}
|
|
|
|
executor_->CreateVariables(*inference_program_, 0, false, sub_scope_);
|
|
|
|
return true;
|
|
}
|
|
bool AnalysisPredictor::CreateExecutor() {
|
|
if (config_.use_gpu_) {
|
|
status_use_gpu_ = true;
|
|
place_ = paddle::platform::CUDAPlace(config_.device_id_);
|
|
} else {
|
|
place_ = paddle::platform::CPUPlace();
|
|
}
|
|
executor_.reset(new paddle::framework::NaiveExecutor(place_));
|
|
return true;
|
|
}
|
|
// Prepares `executor_` to run block 0 of `inference_program_` inside
// `sub_scope_`, honoring config_.use_feed_fetch_ops_.
//
// Raises an enforce failure if `sub_scope_` is null. Always returns true
// otherwise.
bool AnalysisPredictor::PrepareExecutor() {
  // Validate the scope before it is handed to the executor; checking it only
  // after Prepare() (as was done previously) cannot protect the call.
  PADDLE_ENFORCE_NOT_NULL(sub_scope_);

  executor_->Prepare(sub_scope_, *inference_program_, 0,
                     config_.use_feed_fetch_ops_);

  return true;
}
|
|
|
|
// Per-run MKLDNN setup; a no-op unless built with PADDLE_WITH_MKLDNN.
//
// When config_.mkldnn_cache_capacity_ is positive, switches the current
// MKLDNN session to cache-clearing mode, publishes the cache capacity, and
// records a string made of every input dimension followed by "-" as the
// current input shape.
void AnalysisPredictor::MkldnnPreSet(const std::vector<PaddleTensor> &inputs) {
#ifdef PADDLE_WITH_MKLDNN
  VLOG(2) << "AnalysisPredictor::Run get_cur_mkldnn_session_id="
          << platform::get_cur_mkldnn_session_id();
  if (config_.mkldnn_cache_capacity_ <= 0) {
    return;  // Not in cache clearing mode.
  }

  VLOG(2) << "In mkldnn cache clear mode.";
  platform::set_cur_mkldnn_session_id(
      platform::kMKLDNNSessionID_CacheClearing);
  platform::set_cur_input_shape_cache_capacity(
      config_.mkldnn_cache_capacity_);

  // Set current_input_shape for caching dynamic shape.
  std::stringstream shape_key;
  for (const auto &input : inputs) {
    for (const auto &dim : input.shape) {
      shape_key << dim << "-";
    }
  }
  VLOG(2) << "Set input shape=" << shape_key.str();
  platform::set_cur_input_shape_str(shape_key.str());
#endif
}
|
|
|
|
// Reverts the per-run MKLDNN state set by the cache-clearing mode; a no-op
// unless built with PADDLE_WITH_MKLDNN and config_.mkldnn_cache_capacity_ is
// positive. Restores the default session id, a cache capacity of 0 and an
// empty input-shape string.
void AnalysisPredictor::MkldnnPostReset() {
#ifdef PADDLE_WITH_MKLDNN
  if (config_.mkldnn_cache_capacity_ <= 0) {
    return;  // Not in cache clearing mode.
  }
  paddle::platform::set_cur_mkldnn_session_id(
      platform::kMKLDNNSessionID_Default);
  platform::set_cur_input_shape_cache_capacity(0);
  platform::set_cur_input_shape_str("");
#endif
}
|
|
|
|
bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
|
|
std::vector<PaddleTensor> *output_data,
|
|
int batch_size) {
|
|
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
|
|
#ifdef PADDLE_WITH_MKLDNN
|
|
if (config_.use_mkldnn_) MkldnnPreSet(inputs);
|
|
#endif
|
|
VLOG(3) << "Predictor::predict";
|
|
inference::Timer timer;
|
|
timer.tic();
|
|
// set feed variable
|
|
framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
|
|
PADDLE_ENFORCE_NOT_NULL(scope, "The scope should not be nullptr.");
|
|
if (!SetFeed(inputs, scope)) {
|
|
LOG(ERROR) << "fail to set feed";
|
|
return false;
|
|
}
|
|
|
|
// Run the inference program
|
|
// if share variables, we need not create variables
|
|
executor_->Run();
|
|
|
|
// get fetch variable
|
|
if (!GetFetch(output_data, scope)) {
|
|
LOG(ERROR) << "fail to get fetches";
|
|
return false;
|
|
}
|
|
|
|
VLOG(3) << "predict cost: " << timer.toc() << "ms";
|
|
|
|
// All the containers in the scope will be hold in inference, but the
|
|
// operators assume that the container will be reset after each batch.
|
|
// Here is a bugfix, collect all the container variables, and reset then to a
|
|
// bool; the next time, the operator will call MutableData and construct a new
|
|
// container again, so that the container will be empty for each batch.
|
|
if (sub_scope_) {
|
|
tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_);
|
|
}
|
|
tensor_array_batch_cleaner_.ResetNoTensorVars();
|
|
|
|
// recover the cpu_math_library_num_threads to 1, in order to avoid thread
|
|
// conflict when integrating it into deployment service.
|
|
paddle::platform::SetNumThreads(1);
|
|
#ifdef PADDLE_WITH_MKLDNN
|
|
if (config_.use_mkldnn_) MkldnnPostReset();
|
|
#endif
|
|
#if defined(PADDLE_WITH_MKLML) && defined(_LINUX)
|
|
// Frees unused memory allocated by the Intel® MKL Memory Allocator to
|
|
// avoid memory leak. See:
|
|
// https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
|
|
platform::dynload::MKL_Free_Buffers();
|
|
// We don't support windows since MKL_Free_Buffers is not in
|
|
// mklml_win_2019.0.1.20181227.zip. We will upgrade mklml_win version later.
|
|
#endif
|
|
return true;
|
|
}
|
|
|
|
bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
|
|
framework::Scope *scope) {
|
|
VLOG(3) << "Predictor::set_feed";
|
|
if (inputs.size() != feeds_.size()) {
|
|
LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
|
|
<< inputs.size();
|
|
return false;
|
|
}
|
|
|
|
// Cache the inputs memory for better concurrency performance.
|
|
feed_tensors_.resize(inputs.size());
|
|
|
|
for (size_t i = 0; i < inputs.size(); ++i) {
|
|
framework::LoDTensor *input = &feed_tensors_[i];
|
|
if (!PaddleTensorToLoDTensor(inputs[i], input, place_)) {
|
|
return false;
|
|
}
|
|
int idx = -1;
|
|
if (config_.specify_input_name_) {
|
|
auto name = inputs[i].name;
|
|
if (feed_names_.find(name) == feed_names_.end()) {
|
|
LOG(ERROR) << "feed names from program do not have name: [" << name
|
|
<< "] from specified input";
|
|
}
|
|
idx = feed_names_[name];
|
|
} else {
|
|
idx = boost::get<int>(feeds_[i]->GetAttr("col"));
|
|
}
|
|
framework::SetFeedVariable(scope, *input, "feed", idx);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
template <typename T>
|
|
void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
|
|
PaddleTensor *output) {
|
|
// set shape.
|
|
auto shape = framework::vectorize(fetch.dims());
|
|
output->shape.assign(shape.begin(), shape.end());
|
|
// set data.
|
|
const T *data = fetch.data<T>();
|
|
int num_elems = inference::VecReduceToInt(shape);
|
|
output->data.Resize(num_elems * sizeof(T));
|
|
// The fetched tensor output by fetch op, should always in CPU memory, so just
|
|
// copy.
|
|
memcpy(output->data.data(), data, num_elems * sizeof(T));
|
|
// set lod
|
|
output->lod.clear();
|
|
for (auto &level : fetch.lod()) {
|
|
output->lod.emplace_back(level.begin(), level.end());
|
|
}
|
|
}
|
|
|
|
bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
|
|
framework::Scope *scope) {
|
|
VLOG(3) << "Predictor::get_fetch";
|
|
outputs->resize(fetches_.size());
|
|
for (size_t i = 0; i < fetches_.size(); ++i) {
|
|
int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
|
|
PADDLE_ENFORCE((size_t)idx == i);
|
|
framework::FetchType &fetch_var =
|
|
framework::GetFetchVariable(*scope, "fetch", idx);
|
|
auto &fetch = boost::get<framework::LoDTensor>(fetch_var);
|
|
auto type = fetch.type();
|
|
auto output = &(outputs->at(i));
|
|
output->name = fetches_[idx]->Input("X")[0];
|
|
if (type == framework::proto::VarType::FP32) {
|
|
GetFetchOne<float>(fetch, output);
|
|
output->dtype = PaddleDType::FLOAT32;
|
|
} else if (type == framework::proto::VarType::INT64) {
|
|
GetFetchOne<int64_t>(fetch, output);
|
|
output->dtype = PaddleDType::INT64;
|
|
} else if (type == framework::proto::VarType::INT32) {
|
|
GetFetchOne<int32_t>(fetch, output);
|
|
output->dtype = PaddleDType::INT32;
|
|
} else {
|
|
LOG(ERROR) << "unknown type, only support float32, int64 and int32 now.";
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void AnalysisPredictor::PrepareArgument() {
|
|
argument_.SetUseGPU(config_.use_gpu());
|
|
argument_.SetUseFcPadding(config_.use_fc_padding());
|
|
argument_.SetGPUDeviceId(config_.gpu_device_id());
|
|
argument_.SetEnableAnalysisOptim(config_.enable_ir_optim_);
|
|
argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
|
|
argument_.SetModelFromMemory(config_.model_from_memory_);
|
|
// Analyze inference_program
|
|
argument_.SetPredictorID(predictor_id_);
|
|
argument_.SetOptimCacheDir(config_.opt_cache_dir_);
|
|
if (!config_.model_dir().empty()) {
|
|
argument_.SetModelDir(config_.model_dir());
|
|
} else {
|
|
PADDLE_ENFORCE(
|
|
!config_.params_file().empty(),
|
|
"Either model_dir or (param_file, prog_file) should be set.");
|
|
PADDLE_ENFORCE(!config_.prog_file().empty());
|
|
std::string dir = inference::analysis::GetDirRoot(config_.prog_file());
|
|
|
|
argument_.SetModelProgramPath(config_.prog_file());
|
|
argument_.SetModelParamsPath(config_.params_file());
|
|
}
|
|
|
|
if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
|
|
LOG(INFO) << "TensorRT subgraph engine is enabled";
|
|
argument_.SetUseTensorRT(true);
|
|
argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
|
|
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
|
|
argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
|
|
argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
|
|
argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
|
|
argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_);
|
|
argument_.SetMinInputShape(config_.min_input_shape_);
|
|
argument_.SetMaxInputShape(config_.max_input_shape_);
|
|
argument_.SetOptimInputShape(config_.optim_input_shape_);
|
|
argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_);
|
|
}
|
|
|
|
if (config_.lite_engine_enabled()) {
|
|
argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
|
|
argument_.SetLitePassesFilter(config_.lite_passes_filter_);
|
|
argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
|
|
LOG(INFO) << "Lite subgraph engine is enabled";
|
|
}
|
|
|
|
if (config_.use_mkldnn_) {
|
|
LOG(INFO) << "MKLDNN is enabled";
|
|
argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
|
|
}
|
|
|
|
#ifdef PADDLE_WITH_MKLDNN
|
|
if (config_.mkldnn_quantizer_enabled()) {
|
|
LOG(INFO) << "Quantization is enabled";
|
|
argument_.SetQuantizeEnabledOpTypes(
|
|
config_.mkldnn_quantizer_config()->enabled_op_types());
|
|
argument_.SetQuantizeExcludedOpIds(
|
|
config_.mkldnn_quantizer_config()->excluded_op_ids());
|
|
}
|
|
#endif
|
|
|
|
auto passes = config_.pass_builder()->AllPasses();
|
|
if (!config_.ir_optim()) {
|
|
passes.clear();
|
|
LOG(INFO) << "ir_optim is turned off, no IR pass will be executed";
|
|
}
|
|
argument_.SetDisableLogs(config_.glog_info_disabled());
|
|
argument_.SetIrAnalysisPasses(passes);
|
|
argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
|
|
argument_.SetScopeNotOwned(scope_.get());
|
|
}
|
|
|
|
// NOTE All the members in AnalysisConfig should be copied to Argument.
|
|
// Runs the analyzer over the configured model and installs the analyzed
// program as `inference_program_`.
//
// Steps: fill `argument_` from the config, run the Analyzer, verify that the
// argument still has a valid scope and an analyzed program, copy that program
// into a new ProgramDesc, then partially release the storage held by
// `argument_` and `config_`.
void AnalysisPredictor::OptimizeInferenceProgram() {
  PrepareArgument();
  Analyzer().Run(&argument_);

  PADDLE_ENFORCE(argument_.scope_valid());
  VLOG(5) << "to prepare executor";
  ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program);
  auto *analyzed_program =
      new framework::ProgramDesc(argument_.ir_analyzed_program());
  inference_program_.reset(analyzed_program);

  // The config and argument take a lot of storage,
  // when the predictor settings are complete, we release these stores.
  argument_.PartiallyRelease();
  config_.PartiallyRelease();
  LOG(INFO) << "======= optimize end =======";
}
|
|
|
|
template <>
|
|
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
|
|
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
|
|
if (config.glog_info_disabled()) {
|
|
FLAGS_logtostderr = 1;
|
|
FLAGS_minloglevel = 2; // GLOG_ERROR
|
|
}
|
|
VLOG(3) << "create AnalysisConfig";
|
|
PADDLE_ENFORCE(config.is_valid(),
|
|
"Note: Each config can only be used for one predictor.");
|
|
if (config.use_gpu()) {
|
|
// 1. GPU memory
|
|
PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
|
|
PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
|
|
config.gpu_device_id());
|
|
std::vector<std::string> flags;
|
|
|
|
float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
|
|
if (fraction_of_gpu_memory > 0.95f) {
|
|
LOG(ERROR)
|
|
<< "Allocate too much memory for the GPU memory pool, assigned "
|
|
<< config.memory_pool_init_size_mb() << " MB";
|
|
LOG(ERROR)
|
|
<< "Try to shink the value by setting AnalysisConfig::EnableGpu(...)";
|
|
}
|
|
|
|
if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
|
|
flags.push_back("dummy");
|
|
std::string flag = "--fraction_of_gpu_memory_to_use=" +
|
|
std::to_string(fraction_of_gpu_memory);
|
|
flags.push_back(flag);
|
|
flags.push_back("--cudnn_deterministic=True");
|
|
VLOG(3) << "set flag: " << flag;
|
|
framework::InitGflags(flags);
|
|
}
|
|
}
|
|
|
|
std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
|
|
// Each config can only be used for one predictor.
|
|
config.SetInValid();
|
|
auto predictor_p = dynamic_cast<AnalysisPredictor *>(predictor.get());
|
|
|
|
if (!predictor_p->Init(nullptr)) {
|
|
return nullptr;
|
|
}
|
|
|
|
if (config.mkldnn_quantizer_enabled() && !predictor_p->MkldnnQuantize()) {
|
|
return nullptr;
|
|
}
|
|
|
|
return predictor;
|
|
}
|
|
|
|
// Quantizes the predictor with the MKLDNN quantizer.
//
// With PADDLE_WITH_MKLDNN the quantizer is created on first use from the
// config's quantizer settings and the result of its Quantize() call is
// returned. Without it an error is logged and false is returned.
bool AnalysisPredictor::MkldnnQuantize() {
#if PADDLE_WITH_MKLDNN
  if (mkldnn_quantizer_ == nullptr) {
    mkldnn_quantizer_ = new AnalysisPredictor::MkldnnQuantizer(
        *this, config_.mkldnn_quantizer_config());
  }
  return mkldnn_quantizer_->Quantize();
#else
  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
  return false;
#endif
}
|
|
|
|
void AnalysisPredictor::PrepareFeedFetch() {
|
|
PADDLE_ENFORCE_NOT_NULL(sub_scope_);
|
|
CreateFeedFetchVar(sub_scope_);
|
|
for (auto *op : inference_program_->Block(0).AllOps()) {
|
|
if (op->Type() == "feed") {
|
|
int idx = boost::get<int>(op->GetAttr("col"));
|
|
if (feeds_.size() <= static_cast<size_t>(idx)) {
|
|
feeds_.resize(idx + 1);
|
|
}
|
|
feeds_[idx] = op;
|
|
feed_names_[op->Output("Out")[0]] = idx;
|
|
idx2feeds_[idx] = op->Output("Out")[0];
|
|
} else if (op->Type() == "fetch") {
|
|
int idx = boost::get<int>(op->GetAttr("col"));
|
|
if (fetches_.size() <= static_cast<size_t>(idx)) {
|
|
fetches_.resize(idx + 1);
|
|
}
|
|
fetches_[idx] = op;
|
|
idx2fetches_[idx] = op->Input("X")[0];
|
|
}
|
|
}
|
|
}
|
|
|
|
// Ensures `scope` contains a "feed" variable holding a FeedList and a "fetch"
// variable holding a FetchList. Raises an enforce failure on a null scope.
void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
  PADDLE_ENFORCE_NOT_NULL(scope);
  auto *feed_var = scope->Var("feed");
  feed_var->GetMutable<framework::FeedList>();
  auto *fetch_var = scope->Var("fetch");
  fetch_var->GetMutable<framework::FetchList>();
}
|
|
|
|
std::vector<std::string> AnalysisPredictor::GetInputNames() {
|
|
std::vector<std::string> input_names;
|
|
for (auto &item : idx2feeds_) {
|
|
input_names.push_back(item.second);
|
|
}
|
|
return input_names;
|
|
}
|
|
|
|
std::map<std::string, std::vector<int64_t>>
|
|
AnalysisPredictor::GetInputTensorShape() {
|
|
std::map<std::string, std::vector<int64_t>> input_shapes;
|
|
std::vector<std::string> names = GetInputNames();
|
|
for (std::string name : names) {
|
|
auto *var = inference_program_->Block(0).FindVar(name);
|
|
PADDLE_ENFORCE_NOT_NULL(var, "input %s does not exist.", name);
|
|
input_shapes[name] = var->GetShape();
|
|
}
|
|
return input_shapes;
|
|
}
|
|
|
|
std::vector<std::string> AnalysisPredictor::GetOutputNames() {
|
|
std::vector<std::string> output_names;
|
|
for (auto &item : idx2fetches_) {
|
|
output_names.push_back(item.second);
|
|
}
|
|
return output_names;
|
|
}
|
|
|
|
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
|
|
const std::string &name) {
|
|
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
|
|
std::unique_ptr<ZeroCopyTensor> res(
|
|
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
|
|
res->input_or_output_ = true;
|
|
res->SetName(name);
|
|
if (platform::is_cpu_place(place_)) {
|
|
res->SetPlace(PaddlePlace::kCPU);
|
|
} else {
|
|
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
|
|
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
|
|
const std::string &name) {
|
|
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
|
|
std::unique_ptr<ZeroCopyTensor> res(
|
|
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
|
|
res->input_or_output_ = false;
|
|
res->SetName(name);
|
|
if (platform::is_cpu_place(place_)) {
|
|
res->SetPlace(PaddlePlace::kCPU);
|
|
} else {
|
|
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
|
|
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
|
|
}
|
|
return res;
|
|
}
|
|
|
|
// Runs the prepared program once without going through feed/fetch copies.
//
// Sets the math-library thread count from the config for the duration of the
// run, executes, resets the tensor arrays found in `sub_scope_`, then puts
// the thread count back to 1. Always returns true.
bool AnalysisPredictor::ZeroCopyRun() {
  const auto math_threads = config_.cpu_math_library_num_threads();
  paddle::platform::SetNumThreads(math_threads);
  executor_->Run();

  // Fix TensorArray reuse not cleaned bug.
  tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
  tensor_array_batch_cleaner_.ResetTensorArray();

  // recover the cpu_math_library_num_threads to 1, in order to avoid thread
  // conflict when integrating it into deployment service.
  paddle::platform::SetNumThreads(1);
#if defined(PADDLE_WITH_MKLML) && defined(_LINUX)
  // Frees unused memory allocated by the Intel® MKL Memory Allocator to
  // avoid memory leak. See:
  // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
  platform::dynload::MKL_Free_Buffers();
// We don't support windows since MKL_Free_Buffers is not in
// mklml_win_2019.0.1.20181227.zip. We will upgrade mklml_win version later.
#endif
  return true;
}
|
|
|
|
bool AnalysisPredictor::LoadProgramDesc() {
|
|
// Initialize the inference program
|
|
std::string filename;
|
|
if (!config_.model_dir().empty()) {
|
|
filename = config_.model_dir() + "/__model__";
|
|
} else if (!config_.prog_file().empty() && !config_.params_file().empty()) {
|
|
// All parameters are saved in a single file.
|
|
// The file names should be consistent with that used
|
|
// in Python API `fluid.io.save_inference_model`.
|
|
filename = config_.prog_file();
|
|
} else {
|
|
if (config_.model_dir().empty() && config_.prog_file().empty()) {
|
|
LOG(ERROR)
|
|
<< "Either model_dir or (prog_file, param_file) should be set.";
|
|
return false;
|
|
}
|
|
LOG(ERROR) << string::Sprintf(
|
|
"not valid model path '%s' or program path '%s'.", config_.model_dir(),
|
|
config_.params_file());
|
|
return false;
|
|
}
|
|
|
|
// Create ProgramDesc
|
|
framework::proto::ProgramDesc proto;
|
|
if (!config_.model_from_memory()) {
|
|
std::string pb_content;
|
|
// Read binary
|
|
std::ifstream fin(filename, std::ios::in | std::ios::binary);
|
|
PADDLE_ENFORCE(static_cast<bool>(fin.is_open()), "Cannot open file %s",
|
|
filename);
|
|
fin.seekg(0, std::ios::end);
|
|
pb_content.resize(fin.tellg());
|
|
fin.seekg(0, std::ios::beg);
|
|
fin.read(&(pb_content.at(0)), pb_content.size());
|
|
fin.close();
|
|
|
|
proto.ParseFromString(pb_content);
|
|
} else {
|
|
proto.ParseFromString(config_.prog_file());
|
|
}
|
|
inference_program_.reset(new framework::ProgramDesc(proto));
|
|
return true;
|
|
}
|
|
|
|
bool AnalysisPredictor::LoadParameters() {
|
|
PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
|
|
"The inference program should be loaded first.");
|
|
|
|
const auto &global_block = inference_program_->MutableBlock(0);
|
|
|
|
// create a temporary program to load parameters.
|
|
|
|
std::unique_ptr<framework::ProgramDesc> load_program(
|
|
new framework::ProgramDesc());
|
|
framework::BlockDesc *load_block = load_program->MutableBlock(0);
|
|
std::vector<std::string> params;
|
|
|
|
for (auto *var : global_block->AllVars()) {
|
|
if (IsPersistable(var)) {
|
|
VLOG(3) << "persistable variable's name: " << var->Name();
|
|
|
|
framework::VarDesc *new_var = load_block->Var(var->Name());
|
|
new_var->SetShape(var->GetShape());
|
|
new_var->SetDataType(var->GetDataType());
|
|
new_var->SetType(var->GetType());
|
|
new_var->SetLoDLevel(var->GetLoDLevel());
|
|
new_var->SetPersistable(true);
|
|
|
|
if (!config_.params_file().empty()) {
|
|
params.push_back(new_var->Name());
|
|
} else {
|
|
// append_op
|
|
framework::OpDesc *op = load_block->AppendOp();
|
|
op->SetType("load");
|
|
op->SetOutput("Out", {new_var->Name()});
|
|
op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()});
|
|
op->CheckAttrs();
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!config_.params_file().empty()) {
|
|
// sort paramlist to have consistent ordering
|
|
std::sort(params.begin(), params.end());
|
|
// append just the load_combine op
|
|
framework::OpDesc *op = load_block->AppendOp();
|
|
op->SetType("load_combine");
|
|
op->SetOutput("Out", params);
|
|
op->SetAttr("file_path", {config_.params_file()});
|
|
op->CheckAttrs();
|
|
}
|
|
|
|
// Use NaiveExecutor to Load parameters.
|
|
framework::NaiveExecutor e(place_);
|
|
e.Prepare(scope_.get(), *load_program, 0, false);
|
|
e.Run();
|
|
VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load";
|
|
|
|
return true;
|
|
}
|
|
|
|
#if PADDLE_WITH_TENSORRT
// Writes the INT8 calibration table of every "tensorrt_engine" op in block 0
// to disk and then frees all calibrator resources.
//
// For each such op the calibrator engine registered under the op's
// "engine_key" is told to finish, its thread is joined, and the resulting
// table is written to the path derived from the model's optimization cache
// directory and the engine key.
//
// Returns false (after logging) if an engine has no registered calibrator or
// produces an empty table; in that case the calibrator resources are not
// released by this function. Raises an enforce failure if TensorRT is not
// enabled in the config. Returns true otherwise.
bool AnalysisPredictor::SaveTrtCalibToDisk() {
  PADDLE_ENFORCE(config_.tensorrt_engine_enabled(),
                 "This func can be invoked only in trt mode");
  auto &block = inference_program_->Block(0);
  for (auto &op_desc : block.AllOps()) {
    if (op_desc->Type() != "tensorrt_engine") {
      continue;
    }
    std::string engine_name =
        boost::get<std::string>(op_desc->GetAttr("engine_key"));
    if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_name)) {
      LOG(ERROR) << "You should run the predictor(with trt) on the real data "
                    "to generate calibration info";
      return false;
    }
    TRTCalibratorEngine *calib_engine =
        Singleton<TRTCalibratorEngineManager>::Global().Get(engine_name);
    LOG(INFO) << "Wait for calib threads done.";
    calib_engine->calib_->waitAndSetDone();
    LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot "
                 "of time...";
    calib_engine->thr_->join();
    std::string calibration_table_data =
        calib_engine->calib_->getCalibrationTableAsString();

    if (calibration_table_data.empty()) {
      LOG(ERROR) << "the calibration table is empty.";
      return false;
    }

    // Root directory of the model: the model dir if one was given, else the
    // directory containing the program file.
    std::string model_opt_cache_dir;
    if (argument_.Has("model_dir")) {
      model_opt_cache_dir = argument_.model_dir();
    } else {
      model_opt_cache_dir =
          inference::analysis::GetDirRoot(argument_.model_program_path());
    }

    std::string calibration_table_data_path =
        inference::analysis::GetTrtCalibPath(
            inference::analysis::GetOrCreateModelOptCacheDir(
                model_opt_cache_dir),
            engine_name);

    std::ofstream ofile(calibration_table_data_path, std::ios::out);
    LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file "
              << calibration_table_data_path;
    ofile << calibration_table_data;
    ofile.close();
  }
  // Free all calibrator resources.
  Singleton<TRTCalibratorEngineManager>::Global().DeleteALL();
  return true;
}
#endif
|
|
|
|
// Tears the predictor down: saves pending TensorRT INT8 calibration data
// (TensorRT builds only), stops the profiler and writes "./profile.log" when
// profiling was enabled, removes the private sub-scope from the root scope,
// and destroys the MKLDNN quantizer (MKLDNN builds only).
AnalysisPredictor::~AnalysisPredictor() {
#if PADDLE_WITH_TENSORRT
  const bool has_int8_calibration =
      config_.tensorrt_engine_enabled() &&
      config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
      Singleton<TRTCalibratorEngineManager>::Global().Has();
  if (has_int8_calibration) {
    SaveTrtCalibToDisk();
  }
#endif
  if (config_.with_profile_) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
  }
  if (sub_scope_ != nullptr) {
    scope_->DeleteScope(sub_scope_);
  }

#if PADDLE_WITH_MKLDNN
  // Deleting a null pointer is a no-op, so no explicit guard is needed.
  delete mkldnn_quantizer_;
  mkldnn_quantizer_ = nullptr;
#endif
}
|
|
|
|
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
|
|
std::lock_guard<std::mutex> lk(clone_mutex_);
|
|
auto *x = new AnalysisPredictor(config_);
|
|
x->Init(scope_, inference_program_);
|
|
return std::unique_ptr<PaddlePredictor>(x);
|
|
}
|
|
|
|
std::string AnalysisPredictor::GetSerializedProgram() const {
|
|
return inference_program_->Proto()->SerializeAsString();
|
|
}
|
|
|
|
bool AnalysisPredictor::CheckOperatorCompatible() {
|
|
if (!inference_program_) {
|
|
LOG(FATAL) << "Inference program version check failed because the program "
|
|
"does not exist.";
|
|
return false;
|
|
}
|
|
bool res = true;
|
|
op_compatible_map_.ReadFromProto(*inference_program_->OpCompatibleMap());
|
|
const auto &version = framework::DumpVersion(framework::kCurProgramVersion);
|
|
LOG(INFO) << "MODEL VERSION: "
|
|
<< framework::DumpVersion(inference_program_->Version());
|
|
LOG(INFO) << "PREDICTOR VERSION: " << version;
|
|
std::set<std::string> op_types;
|
|
for (size_t i = 0; i < inference_program_->Size(); ++i) {
|
|
const auto &block = inference_program_->Block(i);
|
|
for (const auto *op : block.AllOps()) {
|
|
op_types.insert(op->Type());
|
|
}
|
|
}
|
|
for (const auto type : op_types) {
|
|
auto compatible_type =
|
|
op_compatible_map_.IsRequireMiniVersion(type, version);
|
|
if (compatible_type != framework::OpCompatibleType::compatible) {
|
|
if (!framework::kCurProgramVersion) {
|
|
LOG(WARNING) << " - Version incompatible ("
|
|
<< static_cast<int>(compatible_type) << ") " << type;
|
|
}
|
|
res = false;
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
// Add SaveOptimModel
|
|
void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
|
|
// save model
|
|
std::string model_name = dir + "/model";
|
|
std::ofstream outfile;
|
|
outfile.open(model_name, std::ios::out | std::ios::binary);
|
|
std::string inference_prog_desc = GetSerializedProgram();
|
|
outfile << inference_prog_desc;
|
|
// save params
|
|
framework::ProgramDesc save_program;
|
|
auto *save_block = save_program.MutableBlock(0);
|
|
|
|
const framework::ProgramDesc &main_program = program();
|
|
const framework::BlockDesc &global_block = main_program.Block(0);
|
|
std::vector<std::string> save_var_list;
|
|
for (framework::VarDesc *var : global_block.AllVars()) {
|
|
if (IsPersistable(var)) {
|
|
framework::VarDesc *new_var = save_block->Var(var->Name());
|
|
new_var->SetShape(var->GetShape());
|
|
new_var->SetDataType(var->GetDataType());
|
|
new_var->SetType(var->GetType());
|
|
new_var->SetLoDLevel(var->GetLoDLevel());
|
|
new_var->SetPersistable(true);
|
|
|
|
save_var_list.push_back(new_var->Name());
|
|
}
|
|
}
|
|
std::sort(save_var_list.begin(), save_var_list.end());
|
|
auto *op = save_block->AppendOp();
|
|
op->SetType("save_combine");
|
|
op->SetInput("X", save_var_list);
|
|
op->SetAttr("file_path", dir + "/params");
|
|
op->CheckAttrs();
|
|
|
|
platform::CPUPlace place;
|
|
framework::Executor exe(place);
|
|
exe.Run(save_program, scope(), 0, true, true);
|
|
}
|
|
|
|
template <>
|
|
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
|
|
const AnalysisConfig &config) {
|
|
return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
|
|
config);
|
|
}
|
|
|
|
} // namespace paddle
|
|
|
|
#if PADDLE_WITH_TENSORRT
|
|
USE_TRT_CONVERTER(elementwise_add_weight);
|
|
USE_TRT_CONVERTER(elementwise_add_tensor);
|
|
USE_TRT_CONVERTER(elementwise_sub_tensor);
|
|
USE_TRT_CONVERTER(elementwise_div_tensor);
|
|
USE_TRT_CONVERTER(elementwise_mul_tensor);
|
|
USE_TRT_CONVERTER(elementwise_max_tensor);
|
|
USE_TRT_CONVERTER(elementwise_min_tensor);
|
|
USE_TRT_CONVERTER(elementwise_pow_tensor);
|
|
USE_TRT_CONVERTER(mul);
|
|
USE_TRT_CONVERTER(conv2d);
|
|
USE_TRT_CONVERTER(relu);
|
|
USE_TRT_CONVERTER(sigmoid);
|
|
USE_TRT_CONVERTER(tanh);
|
|
USE_TRT_CONVERTER(fc);
|
|
USE_TRT_CONVERTER(pool2d);
|
|
USE_TRT_CONVERTER(softmax);
|
|
USE_TRT_CONVERTER(batch_norm);
|
|
USE_TRT_CONVERTER(concat);
|
|
USE_TRT_CONVERTER(dropout);
|
|
USE_TRT_CONVERTER(pad);
|
|
USE_TRT_CONVERTER(hard_sigmoid);
|
|
USE_TRT_CONVERTER(hard_swish);
|
|
USE_TRT_CONVERTER(split);
|
|
USE_TRT_CONVERTER(prelu);
|
|
USE_TRT_CONVERTER(conv2d_transpose);
|
|
USE_TRT_CONVERTER(leaky_relu);
|
|
USE_TRT_CONVERTER(shuffle_channel);
|
|
USE_TRT_CONVERTER(swish);
|
|
USE_TRT_CONVERTER(instance_norm);
|
|
USE_TRT_CONVERTER(layer_norm);
|
|
USE_TRT_CONVERTER(gelu);
|
|
USE_TRT_CONVERTER(multihead_matmul);
|
|
USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm);
|
|
USE_TRT_CONVERTER(skip_layernorm);
|
|
USE_TRT_CONVERTER(slice);
|
|
USE_TRT_CONVERTER(scale);
|
|
#endif
|