Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_pyramid_dnn_support

test=develop
revert-15207-remove_op_handle_lock_and_fix_var
minqiyang 6 years ago
commit 68a07328fa

@ -134,6 +134,7 @@ if(WITH_GPU)
message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
endif()
add_definitions(-DWITH_ANAKIN)
endif()
if(WITH_ANAKIN)
# NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR

@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_GIT_TAG "v0.10.1")
SET(NGRAPH_GIT_TAG "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)

@ -18,7 +18,7 @@ limitations under the License. */
#include <memory>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
@ -86,10 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
if (strategy.memory_optimize_) {
auto analysis_var_pass = AppendPass("analysis_var_pass");
}
// Convert graph to run on multi-devices.
auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
AppendMultiDevPass(strategy);
// Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) {
@ -115,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
}
}
// Convert graph to run on multi-devices.
void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass;
if (strategy_.is_distribution_) {
multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else {
if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
multi_devices_pass =
AppendPass("allreduce_mode_multi_devices_pass").get();
} else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
} else {
PADDLE_THROW("Unknown reduce strategy.");
}
}
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
}
private:
BuildStrategy strategy_;
};
@ -131,6 +148,10 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
return pass_builder_;
}
bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0;
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
@ -145,22 +166,23 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (pass->Type() == "multi_devices_pass") {
pass->Erase("places");
pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
pass->Erase("loss_var_name");
pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
pass->Erase("local_scopes");
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
if (IsMultiDevPass(pass->Type())) {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLossVarName);
pass->SetNotOwned<const std::string>(kLossVarName, &loss_var_name);
pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes);
pass->Erase("nranks");
pass->Set<size_t>("nranks", new size_t(nranks));
pass->Erase(kNRanks);
pass->Set<size_t>(kNRanks, new size_t(nranks));
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "analysis_var_pass") {
const std::vector<OpDesc *> *all_op_descs =
new std::vector<OpDesc *>(main_program.Block(0).AllOps());
@ -201,7 +223,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass);
USE_PASS(multi_devices_pass);
USE_PASS(reduce_mode_multi_devices_pass);
USE_PASS(allreduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(analysis_var_pass);

@ -74,8 +74,6 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false};
bool enable_data_balance_{false};
bool memory_optimize_{false};
bool memory_early_delete_{false};
@ -84,6 +82,10 @@ struct BuildStrategy {
bool fuse_broadcast_op_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model.
bool is_distribution_{false};
int num_trainers_{1};
int trainer_id_{0};
std::vector<std::string> trainers_endpoints_;
@ -104,6 +106,8 @@ struct BuildStrategy {
bool IsFinalized() const { return is_finalized_; }
bool IsMultiDevPass(const std::string &pass_name) const;
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,

@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include <string>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
@ -21,68 +21,78 @@ namespace paddle {
namespace framework {
namespace details {
bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
std::unordered_set<VarHandleBase *> ready_vars;
std::unordered_set<OpHandleBase *> ready_ops;
class SSAGraghBuilderWithChecker : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
PADDLE_ENFORCE(IsValidGraph(graph.get()));
return graph;
}
auto insert_pending_var = [&](VarHandleBase *var) {
pending_vars.insert(var);
if (var->GeneratedOp() == nullptr) {
ready_vars.emplace(var);
}
};
bool IsValidGraph(const ir::Graph *graph) const {
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
std::unordered_set<VarHandleBase *> ready_vars;
std::unordered_set<OpHandleBase *> ready_ops;
for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
insert_pending_var(version_pair);
auto insert_pending_var = [&](VarHandleBase *var) {
pending_vars.insert(var);
if (var->GeneratedOp() == nullptr) {
ready_vars.emplace(var);
}
}
}
};
for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
insert_pending_var(var);
}
for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
insert_pending_var(version_pair);
}
}
}
for (OpHandleBase *op : ir::FilterByNodeWrapper<OpHandleBase>(*graph)) {
if (op->Inputs().empty()) {
ready_ops.insert(op);
} else {
pending_ops.insert({op, op->NoDupInputSize()});
for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
insert_pending_var(var);
}
}
auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
for (auto *op : set) {
for (auto out : op->Outputs()) {
ready_vars.emplace(out);
for (OpHandleBase *op : ir::FilterByNodeWrapper<OpHandleBase>(*graph)) {
if (op->Inputs().empty()) {
ready_ops.insert(op);
} else {
pending_ops.insert({op, op->NoDupInputSize()});
}
}
set.clear();
};
while (!pending_vars.empty()) {
run_all_ops(ready_ops);
auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
for (auto *op : set) {
for (auto out : op->Outputs()) {
ready_vars.emplace(out);
}
}
set.clear();
};
if (ready_vars.empty()) {
return false;
}
while (!pending_vars.empty()) {
run_all_ops(ready_ops);
for (auto ready_var : ready_vars) {
pending_vars.erase(ready_var);
for (auto *op : ready_var->PendingOps()) {
auto &deps = --pending_ops[op];
if (deps == 0) {
ready_ops.insert(op);
if (ready_vars.empty()) {
return false;
}
for (auto ready_var : ready_vars) {
pending_vars.erase(ready_var);
for (auto *op : ready_var->PendingOps()) {
auto &deps = --pending_ops[op];
if (deps == 0) {
ready_ops.insert(op);
}
}
}
ready_vars.clear();
}
ready_vars.clear();
return true;
}
return true;
}
};
} // namespace details
} // namespace framework
} // namespace paddle

@ -1,38 +0,0 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include <string>
namespace paddle {
namespace framework {
namespace details {
class SSAGraghBuilderWithChecker : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
PADDLE_ENFORCE(IsValidGraph(graph.get()));
return graph;
}
bool IsValidGraph(const ir::Graph* graph) const;
};
} // namespace details
} // namespace framework
} // namespace paddle

File diff suppressed because it is too large Load Diff

@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <string>
#include <utility>
#include <vector>
@ -30,78 +31,70 @@ namespace framework {
class Scope;
namespace details {
class MultiDevSSAGraphBuilder : public ir::Pass {
constexpr char kLossVarName[] = "loss_var_name";
constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks";
class MultiDevSSAGraphBuilderBase : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
private:
void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
size_t device_id) const;
void Init() const;
virtual void Init() const;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
mutable platform::NCCLContextMap *nccl_ctxs_;
#endif
virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
int GetVarDeviceID(
const std::string &varname,
const std::unordered_map<std::string, int> &sharded_var_device) const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const = 0;
bool IsScaleLossOp(ir::Node *node) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0;
virtual void InsertPostprocessOps(ir::Graph *result) const = 0;
int CreateRPCOp(
ir::Graph *result, ir::Node *node,
std::unordered_map<std::string, int> *sharded_var_device) const;
int CreateDistTrainOp(
ir::Graph *result, ir::Node *node,
std::unordered_map<std::string, int> *sharded_var_device) const;
bool UseGPU() const;
bool NeedCollectiveOps() const;
bool IsScaleLossOp(ir::Node *node) const;
void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const;
void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name,
ir::Node *out_var_node,
ir::Node *out_var_node, size_t loss_scale,
proto::VarType::Type dtype) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const;
void CreateComputationalOp(ir::Graph *result, ir::Node *node,
int dev_id) const;
int GetOpDeviceID(
ir::Node *node,
const std::unordered_map<std::string, int> &sharded_var_device) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
bool IsSparseGradient(const std::string &og) const;
void InsertDataBalanceOp(ir::Graph *result,
const std::vector<std::string> &datas) const;
void CreateAllReduceOp(ir::Graph *result, const std::string &og) const;
void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
size_t src_dev_id) const;
void InsertScaleLossGradOp(ir::Graph *result, const ir::Node *node) const;
void CreateFusedBroadcastOp(
ir::Graph *result,
const std::vector<std::unordered_set<std::string>> &bcast_varnames) const;
bool IsSparseGradient(const std::string &og) const;
size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const;
std::vector<ir::Node *> SortForReduceMode(
const std::vector<ir::Node *> &) const;
void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
size_t device_id) const;
int GetOpDeviceID(
ir::Node *node,
const std::unordered_map<std::string, int> &shared_var_device,
std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
const;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
mutable platform::NCCLContextMap *nccl_ctxs_;
#endif
mutable std::string loss_var_name_;
mutable std::vector<platform::Place> places_;
@ -109,8 +102,83 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
mutable BuildStrategy strategy_;
mutable std::unordered_map<std::string, VarDesc *> all_vars_;
};
class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
protected:
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
return false;
}
virtual void InsertPostprocessOps(ir::Graph *result) const {}
};
class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
protected:
int GetVarDeviceID(const std::string &varname) const;
int GetOpDeviceID(ir::Node *node) const;
size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
virtual void ResetState() const;
mutable std::unordered_map<std::string, int> sharded_var_device_;
mutable std::vector<int64_t> balance_vars_;
};
class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
protected:
virtual void Init() const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
virtual void InsertPostprocessOps(ir::Graph *result) const;
virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
virtual void ResetState() const;
int GetOpDeviceID(ir::Node *node,
std::unordered_map<std::string, std::vector<ir::Node *>>
*delay_ops) const;
std::vector<ir::Node *> SortForReduceMode(
const std::vector<ir::Node *> &topo_ops) const;
mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
};
class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
protected:
virtual void Init() const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
virtual void InsertPostprocessOps(ir::Graph *result) const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual void ResetState() const;
int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
mutable bool need_broadcast_var_{false};
};
std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
} // namespace details
} // namespace framework
} // namespace paddle

@ -40,14 +40,14 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
void NaiveExecutor::Run() {
#ifndef PADDLE_ON_INFERENCE
LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set.";
LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation "
"overhead.";
LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference";
LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set.";
LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation "
"overhead.";
LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference";
#endif // PADDLE_ON_INFERENCE
for (auto &op : ops_) {
VLOG(3) << std::this_thread::get_id() << " run " << op->Type()

@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
}
}
backend_->call(ngraph_function_, t_out, t_in);
backend_->call(backend_->compile(ngraph_function_), t_out, t_in);
} // NgraphEngine::RunImpl
} // namespace framework
} // namespace paddle

@ -123,8 +123,6 @@ struct Argument {
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller,
std::function<bool(const framework::ir::Node*)>);
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);

@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument,
for (const std::string &pass_name : passes) {
auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
// Set some pass attributes.
if (pass_name == "ir_analysis_pass") {
pass->Set("tensorrt_node_teller",
new SubgraphDetector::NodeInsideSubgraphTeller(
argument->tensorrt_node_teller()));
}
if (pass_name == "graph_viz_pass") {
std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
(pre_pass.empty() ? "origin" : pre_pass) +
@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument,
}
if (pass_name == "tensorrt_subgraph_pass") {
PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
pass->SetNotOwned("tensorrt_node_teller",
argument->tensorrt_node_teller_ptr());
pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
pass->Set("min_subgraph_size",

@ -1,9 +1,13 @@
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector)
set(analysis_deps ${analysis_deps}
subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
if (TENSORRT_FOUND)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
set(analysis_deps ${analysis_deps}
subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
endif()

@ -20,6 +20,7 @@
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
namespace paddle {
namespace inference {
@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
auto teller =
Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller");
auto teller = [](const framework::ir::Node *node) {
if (!node->IsOp() || !node->Op()) return false;
return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
};
SubGraphFuser fuser(graph.get(), teller,
Get<int>("min_subgraph_size") /*min subgraph size*/);
@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters(
REGISTER_PASS(tensorrt_subgraph_pass,
paddle::inference::analysis::TensorRtSubgraphPass)
.RequirePassAttr("tensorrt_node_teller")
.RequirePassAttr("max_batch_size")
.RequirePassAttr("workspace_size")
.RequirePassAttr("min_subgraph_size");

@ -27,9 +27,6 @@ namespace analysis {
void IrAnalysisComposePass::RunImpl(Argument *argument) {
ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
InitTensorRTAttrs(argument);
}
ApplyIrPasses(argument);
CollectFusionStatis(argument);
}
@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const {
return "ir-analysis-compose-pass";
}
void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
LOG(INFO) << "Initing TensorRT pass";
argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) {
std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"conv2d_transpose", "leaky_relu"});
if (!node->IsOp()) return false;
if (teller_set.count(node->Op()->Type())) {
return true;
} else {
return false;
}
});
}
}
void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
std::vector<std::string> passes({
"ir_graph_build_pass", "ir_analysis_pass",

@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass {
std::string repr() const override;
private:
void InitTensorRTAttrs(Argument* argument);
void ApplyIrPasses(Argument* argument);
void CollectFusionStatis(Argument* argument);

File diff suppressed because it is too large Load Diff

@ -33,6 +33,7 @@
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile);
@ -59,8 +60,8 @@ bool AnalysisPredictor::Init(
if (FLAGS_profile) {
LOG(WARNING) << "Profiler is actived, might affect the performance";
LOG(INFO) << "You can turn off by set gflags '-profile false'";
auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
: platform::ProfilerState::kCPU;
auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll
: platform::ProfilerState::kCPU;
platform::EnableProfiler(tracking_device);
}
@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram(
// Optimize the program, and load parameters and modify them in the
// scope_.
// This will change the scope_ address.
if (config_.enable_ir_optim) {
if (config_.ir_optim()) {
status_ir_optim_enabled_ = true;
OptimizeInferenceProgram();
} else {
@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram(
return true;
}
bool AnalysisPredictor::CreateExecutor() {
if (config_.use_gpu) {
if (config_.use_gpu_) {
status_use_gpu_ = true;
place_ = paddle::platform::CUDAPlace(config_.device);
place_ = paddle::platform::CUDAPlace(config_.device_id_);
} else {
place_ = paddle::platform::CPUPlace();
}
@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() {
}
bool AnalysisPredictor::PrepareExecutor() {
executor_->Prepare(sub_scope_, *inference_program_, 0,
config_.use_feed_fetch_ops);
config_.use_feed_fetch_ops_);
PADDLE_ENFORCE_NOT_NULL(sub_scope_);
@ -250,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
}
input.set_lod(lod);
int idx = -1;
if (config_.specify_input_name) {
if (config_.specify_input_name_) {
auto name = inputs[i].name;
if (feed_names_.find(name) == feed_names_.end()) {
LOG(ERROR) << "feed names from program do not have name: [" << name
@ -314,22 +315,22 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
void AnalysisPredictor::OptimizeInferenceProgram() {
status_program_optimized_ = true;
argument_.SetUseGPU(config_.use_gpu);
argument_.SetGPUDeviceId(config_.device);
argument_.SetUseGPU(config_.use_gpu());
argument_.SetGPUDeviceId(config_.gpu_device_id());
argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program
if (!config_.model_dir.empty()) {
argument_.SetModelDir(config_.model_dir);
if (!config_.model_dir().empty()) {
argument_.SetModelDir(config_.model_dir());
} else {
PADDLE_ENFORCE(
!config_.param_file.empty(),
!config_.params_file().empty(),
"Either model_dir or (param_file, prog_file) should be set.");
PADDLE_ENFORCE(!config_.prog_file.empty());
argument_.SetModelProgramPath(config_.prog_file);
argument_.SetModelParamsPath(config_.param_file);
PADDLE_ENFORCE(!config_.prog_file().empty());
argument_.SetModelProgramPath(config_.prog_file());
argument_.SetModelParamsPath(config_.params_file());
}
if (config_.use_gpu && config_.use_tensorrt_) {
if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
argument_.SetUseTensorRT(true);
argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
@ -341,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
auto passes = config_.pass_builder()->AllPasses();
if (!config_.enable_ir_optim) passes.clear();
if (!config_.ir_optim()) passes.clear();
argument_.SetIrAnalysisPasses(passes);
argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
Analyzer().Run(&argument_);
@ -358,18 +359,26 @@ template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) {
if (config.use_gpu()) {
// 1. GPU memeroy
PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
config.gpu_device_id());
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
if (fraction_of_gpu_memory > 0.95f) {
LOG(ERROR)
<< "Allocate too much memory for the GPU memory pool, assigned "
<< config.memory_pool_init_size_mb() << " MB";
LOG(ERROR)
<< "Try to shink the value by setting AnalysisConfig::EnableGpu(...)";
}
if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
std::to_string(config.fraction_of_gpu_memory);
std::to_string(fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
@ -443,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() {
bool AnalysisPredictor::LoadProgramDesc() {
// Initialize the inference program
std::string filename;
if (!config_.model_dir.empty()) {
filename = config_.model_dir + "/__model__";
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
if (!config_.model_dir().empty()) {
filename = config_.model_dir() + "/__model__";
} else if (!config_.prog_file().empty() && !config_.params_file().empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
filename = config_.prog_file;
filename = config_.prog_file();
} else {
if (config_.model_dir.empty() && config_.prog_file.empty()) {
if (config_.model_dir().empty() && config_.prog_file().empty()) {
LOG(ERROR)
<< "Either model_dir or (prog_file, param_file) should be set.";
return false;
}
LOG(ERROR) << string::Sprintf(
"not valid model path '%s' or program path '%s'.", config_.model_dir,
config_.param_file);
"not valid model path '%s' or program path '%s'.", config_.model_dir(),
config_.params_file());
return false;
}
@ -478,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() {
proto.ParseFromString(pb_content);
} else {
proto.ParseFromString(config_.prog_file);
proto.ParseFromString(config_.prog_file());
}
inference_program_.reset(new framework::ProgramDesc(proto));
return true;
@ -508,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() {
new_var->SetLoDLevel(var->GetLoDLevel());
new_var->SetPersistable(true);
if (!config_.param_file.empty()) {
if (!config_.params_file().empty()) {
params.push_back(new_var->Name());
} else {
// append_op
framework::OpDesc *op = load_block->AppendOp();
op->SetType("load");
op->SetOutput("Out", {new_var->Name()});
op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()});
op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()});
op->CheckAttrs();
}
}
}
if (!config_.param_file.empty()) {
if (!config_.params_file().empty()) {
// sort paramlist to have consistent ordering
std::sort(params.begin(), params.end());
// append just the load_combine op
framework::OpDesc *op = load_block->AppendOp();
op->SetType("load_combine");
op->SetOutput("Out", params);
op->SetAttr("file_path", {config_.param_file});
op->SetAttr("file_path", {config_.params_file()});
op->CheckAttrs();
}

@ -25,9 +25,9 @@ namespace paddle {
using contrib::AnalysisConfig;
TEST(AnalysisPredictor, analysis_off) {
AnalysisConfig config(false);
config.model_dir = FLAGS_dirname;
config.enable_ir_optim = false;
AnalysisConfig config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim(false);
auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) {
}
TEST(AnalysisPredictor, analysis_on) {
AnalysisConfig config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim(true);
#ifdef PADDLE_WITH_CUDA
AnalysisConfig config(true);
config.fraction_of_gpu_memory = 0.15;
config.EnableUseGpu(100, 0);
#else
AnalysisConfig config;
config.DisableGpu();
#endif
config.model_dir = FLAGS_dirname;
config.enable_ir_optim = true;
auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) {
}
// compare with NativePredictor
auto naive_predictor = CreatePaddlePredictor<NativeConfig>(config);
auto naive_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
std::vector<PaddleTensor> naive_outputs;
ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs));
ASSERT_EQ(naive_outputs.size(), 1UL);
@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) {
TEST(AnalysisPredictor, ZeroCopy) {
AnalysisConfig config;
config.model_dir = FLAGS_dirname;
config.use_feed_fetch_ops = false;
config.SetModel(FLAGS_dirname);
config.SwitchUseFeedFetchOps(false);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto w0 = predictor->GetInputTensor("firstw");
@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) {
TEST(AnalysisPredictor, Clone) {
AnalysisConfig config;
config.model_dir = FLAGS_dirname;
config.use_feed_fetch_ops = true;
config.enable_ir_optim = true;
config.SetModel(FLAGS_dirname);
config.SwitchUseFeedFetchOps(true);
config.SwitchIrOptim(true);
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor(config));

@ -19,8 +19,6 @@ limitations under the License. */
#pragma once
#define WITH_ANAKIN
#include <vector>
#include "framework/core/net/net.h"

@ -288,7 +288,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) {
// 1. GPU memeroy
PADDLE_ENFORCE_GT(
PADDLE_ENFORCE_GE(
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);

@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) {
#endif
TEST(PassBuilder, Delete) {
contrib::AnalysisConfig config(false);
contrib::AnalysisConfig config;
config.DisableGpu();
config.pass_builder()->DeletePass("attention_lstm_fuse_pass");
const auto& passes = config.pass_builder()->AllPasses();
auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass");

@ -36,12 +36,11 @@ namespace demo {
*/
void Main() {
std::unique_ptr<PaddlePredictor> predictor;
paddle::contrib::AnalysisConfig config(true);
config.param_file = FLAGS_modeldir + "/__params__";
config.prog_file = FLAGS_modeldir + "/__model__";
config.device = 0;
paddle::contrib::AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(FLAGS_modeldir + "/__params__",
FLAGS_modeldir + "/__model__");
config.EnableTensorRtEngine();
config.fraction_of_gpu_memory = 0.1; // set by yourself
predictor = CreatePaddlePredictor(config);
VLOG(3) << "begin to process data";

@ -40,15 +40,14 @@ using contrib::AnalysisConfig;
*/
void Main(bool use_gpu) {
std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
AnalysisConfig config(use_gpu);
config.param_file = FLAGS_modeldir + "/__params__";
config.prog_file = FLAGS_modeldir + "/__model__";
config.device = 0;
if (FLAGS_use_gpu) {
config.fraction_of_gpu_memory = 0.1; // set by yourself
AnalysisConfig config;
if (use_gpu) {
config.EnableUseGpu(100, 0);
}
config.SetModel(FLAGS_modeldir + "/__model__",
FLAGS_modeldir + "/__params__");
predictor = CreatePaddlePredictor<NativeConfig>(config);
predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
analysis_predictor = CreatePaddlePredictor(config);
// Just a single batch of data.

@ -34,26 +34,67 @@ class AnalysisPredictor;
namespace contrib {
// NOTE WIP, not stable yet.
struct AnalysisConfig : public NativeConfig {
explicit AnalysisConfig(bool use_gpu = false);
struct AnalysisConfig {
AnalysisConfig() = default;
explicit AnalysisConfig(const AnalysisConfig& other);
explicit AnalysisConfig(AnalysisConfig&& other);
explicit AnalysisConfig(const std::string& model_dir);
explicit AnalysisConfig(const std::string& prog_file,
const std::string& params_file);
// Model path related.
void SetModel(const std::string& model_dir) { model_dir_ = model_dir; }
void SetModel(const std::string& prog_file_path,
const std::string& params_file_path);
void SetProgFile(const std::string& x) { prog_file_ = x; }
void SetParamsFile(const std::string& x) { params_file_ = x; }
const std::string& model_dir() const { return model_dir_; }
const std::string& prog_file() const { return prog_file_; }
const std::string& params_file() const { return params_file_; }
// GPU related.
void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);
void DisableGpu();
bool use_gpu() const { return use_gpu_; }
int gpu_device_id() const { return device_id_; }
int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; }
float fraction_of_gpu_memory_for_pool() const;
// Determine whether to perform graph optimization.
bool enable_ir_optim = true;
void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; }
bool ir_optim() const { return enable_ir_optim_; }
// Get a pass builder for customize the passes in IR analysis phase.
PassStrategy* pass_builder() const;
void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; }
bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; }
// NOT stable yet.
bool use_feed_fetch_ops{true};
void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; }
bool specify_input_name() const { return specify_input_name_; }
void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3);
bool use_tensorrt() const { return use_tensorrt_; }
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
void SwitchIrDebug(int x = true) { ir_debug_ = x; }
void EnableMKLDNN();
bool use_mkldnn() const { return use_mkldnn_; }
bool mkldnn_enabled() const { return use_mkldnn_; }
// Set and get the number of cpu math library threads.
void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);
int cpu_math_library_num_threads() const {
return cpu_math_library_num_threads_;
}
NativeConfig ToNativeConfig() const {
NativeConfig config;
config.model_dir = model_dir_;
config.prog_file = prog_file_;
config.param_file = params_file_;
config.use_gpu = use_gpu_;
config.device = device_id_;
config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
config.specify_input_name = specify_input_name_;
return config;
}
void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
mkldnn_enabled_op_types_ = op_list;
}
@ -65,10 +106,29 @@ struct AnalysisConfig : public NativeConfig {
friend class ::paddle::AnalysisPredictor;
// NOTE just for developer, not an official API, easily to be broken.
// Get a pass builder for customize the passes in IR analysis phase.
PassStrategy* pass_builder() const;
protected:
// Update the config.
void Update();
std::string SerializeInfoCache();
protected:
// Model pathes.
std::string model_dir_;
std::string prog_file_;
std::string params_file_;
// GPU releated.
bool use_gpu_{false};
int device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
// TensorRT releated.
bool use_tensorrt_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
// For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int tensorrt_workspace_size_;
@ -82,17 +142,24 @@ struct AnalysisConfig : public NativeConfig {
// We set this variable to control the minimum number of nodes in the
// subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3};
std::unique_ptr<PassStrategy> pass_builder_;
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
bool model_from_memory_{false};
};
// Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config {
enum TargetType { NVGPU = 0, X86 };
int device;
std::string model_file;
int max_batch_size{-1};
TargetType target_type;
bool enable_ir_optim_{true};
bool use_feed_fetch_ops_{true};
bool ir_debug_{false};
bool specify_input_name_{false};
int cpu_math_library_num_threads_{1};
// A runtime cache, shouldn't be transferred to others.
std::string serialized_info_cache_;
mutable std::unique_ptr<PassStrategy> pass_builder_;
};
} // namespace contrib

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save