merge develop

upload-readme
nhzlx 6 years ago
commit cc4a7661c6

@ -140,5 +140,11 @@ def parse_args():
'--use_lars', '--use_lars',
action='store_true', action='store_true',
help='If set, use lars for optimizers, ONLY support resnet module.') help='If set, use lars for optimizers, ONLY support resnet module.')
parser.add_argument(
'--reduce_strategy',
type=str,
choices=['reduce', 'all_reduce'],
default='all_reduce',
help='Specify the reduce strategy, can be reduce, all_reduce')
args = parser.parse_args() args = parser.parse_args()
return args return args

@ -170,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
strategy = fluid.ExecutionStrategy() strategy = fluid.ExecutionStrategy()
strategy.num_threads = args.cpus strategy.num_threads = args.cpus
strategy.allow_op_delay = False strategy.allow_op_delay = False
build_strategy = fluid.BuildStrategy()
if args.reduce_strategy == "reduce":
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.Reduce
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
avg_loss = train_args[0] avg_loss = train_args[0]
if args.update_method == "pserver": if args.update_method == "pserver":
@ -184,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
avg_loss.name, avg_loss.name,
main_program=train_prog, main_program=train_prog,
exec_strategy=strategy, exec_strategy=strategy,
build_strategy=build_strategy,
num_trainers=num_trainers, num_trainers=num_trainers,
trainer_id=trainer_id) trainer_id=trainer_id)

@ -67,11 +67,14 @@ def cnn_model(data):
def get_model(args, is_train, main_prog, startup_prog): def get_model(args, is_train, main_prog, startup_prog):
# NOTE: mnist is small, we don't implement data sharding yet. # NOTE: mnist is small, we don't implement data sharding yet.
filelist = [ opt = None
os.path.join(args.data_path, f) for f in os.listdir(args.data_path) data_file_handle = None
]
with fluid.program_guard(main_prog, startup_prog): with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op: if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f)
for f in os.listdir(args.data_path)
]
data_file_handle = fluid.layers.open_files( data_file_handle = fluid.layers.open_files(
filenames=filelist, filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)], shapes=[[-1, 1, 28, 28], (-1, 1)],
@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog):
if is_train: if is_train:
opt = fluid.optimizer.AdamOptimizer( opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999) learning_rate=0.001, beta1=0.9, beta2=0.999)
opt.minimize() opt.minimize(avg_cost)
if args.memory_optimize: if args.memory_optimize:
fluid.memory_optimize(main_prog) fluid.memory_optimize(main_prog)

@ -207,7 +207,7 @@ def get_model(args, is_train, main_prog, startup_prog):
total_images = 1281167 / trainer_count total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1) step = int(total_images / (args.batch_size * args.gpus) + 1)
epochs = [30, 60, 90] epochs = [30, 60, 90]
bd = [step * e for e in epochs] bd = [step * e for e in epochs]
base_lr = args.learning_rate base_lr = args.learning_rate

@ -16,7 +16,9 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
DOC "Path to TensorRT library.") DOC "Path to TensorRT library.")
if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY) if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
if(WITH_DSO)
set(TENSORRT_FOUND ON) set(TENSORRT_FOUND ON)
endif(WITH DSO)
else() else()
set(TENSORRT_FOUND OFF) set(TENSORRT_FOUND OFF)
endif() endif()

@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
paddle.fluid.InferenceTranspiler.__init__ paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg
paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)) paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0))
paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)) paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None))
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None)) paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
paddle.fluid.transpiler.InferenceTranspiler.__init__ paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))

@ -46,7 +46,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif #endif
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent r("all_reduce", nullptr); platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (NoDummyInputSize() == 1) { if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1; return; // No need to all reduce when GPU count = 1;
} else { } else {

@ -15,12 +15,15 @@
#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
void BroadcastOpHandle::RunImpl() { void BroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (places_.size() == 1) return; if (places_.size() == 1) return;
// The input and output may have dummy vars. // The input and output may have dummy vars.

@ -348,14 +348,31 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
size_t cur_device_id = 0; size_t cur_device_id = 0;
bool is_forwarding = true; bool is_forwarding = true;
bool is_dist_train = false;
for (ir::Node *node : sorted_ops) { for (ir::Node *node : sorted_ops) {
if (boost::get<int>( if (boost::get<int>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kRPC)) { static_cast<int>(OpRole::kRPC)) {
CreateRPCOp(&result, node); int op_dev_id = CreateRPCOp(&result, node);
PADDLE_ENFORCE(op_dev_id != -1,
"Can not schedule the RPC operator to the right place.");
if (node->Op()->Type() == "recv") {
auto recv_vars_attr =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient]
if (recv_vars_attr[0].find(".block") == std::string::npos) {
bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]);
}
}
is_dist_train = true;
} else if (IsDistTrainOp(node, send_vars, recv_vars)) { } else if (IsDistTrainOp(node, send_vars, recv_vars)) {
CreateDistTrainOp(&result, node); int op_dev_id = CreateDistTrainOp(&result, node);
if (node->Op()->Type() == "concat") {
auto origin_param_name = node->Op()->OutputArgumentNames()[0];
bcast_var_name_set[op_dev_id].emplace(origin_param_name);
}
} else if (IsScaleLossOp(node)) { } else if (IsScaleLossOp(node)) {
// user can customize loss@grad if not use_default_grad_scale_ // user can customize loss@grad if not use_default_grad_scale_
if (strategy_.gradient_scale_ != if (strategy_.gradient_scale_ !=
@ -414,7 +431,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateReduceOp(&result, g_name, cur_device_id); CreateReduceOp(&result, g_name, cur_device_id);
graph->Get<ShardedVarDevice>(kShardedVarDevice) graph->Get<ShardedVarDevice>(kShardedVarDevice)
.emplace(g_name, cur_device_id); .emplace(g_name, cur_device_id);
bcast_var_name_set[cur_device_id].emplace(p_name); if (!is_dist_train) {
bcast_var_name_set[cur_device_id].emplace(p_name);
}
break; break;
case BuildStrategy::ReduceStrategy::kAllReduce: case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(g_name)) { if (IsSparseGradient(g_name)) {
@ -436,14 +455,19 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
} }
} }
} }
bool use_gpu = false; bool use_gpu = false;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
use_gpu = nccl_ctxs_ != nullptr; use_gpu = nccl_ctxs_ != nullptr;
#endif #endif
if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { // Insert broadcast operators principle:
// Insert BCast Ops // 1. Broadcast optimized parameters in Reduce strategy;
// 2. No need broadcast optimized parameters in AllReduce strategy because of
// the optimization sub-graph would be run on every GPU;
// 3. Allways broadcast received parameters in Distribute Training.
if ((use_gpu &&
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
is_dist_train) {
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set[dev_id]; auto &to_bcast_set = bcast_var_name_set[dev_id];
for (auto &bcast_name : to_bcast_set) { for (auto &bcast_name : to_bcast_set) {
@ -675,8 +699,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
return var; return var;
} }
void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
ir::Node *node) const { ir::Node *node) const {
int op_dev_id = -1; int op_dev_id = -1;
std::vector<std::string> input_var_names; std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names; std::vector<std::string> output_var_names;
@ -719,6 +743,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
node->Op()->Type()); node->Op()->Type());
CreateComputationalOp(result, node, op_dev_id); CreateComputationalOp(result, node, op_dev_id);
return op_dev_id;
} }
void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
@ -737,8 +762,8 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
} }
// Create RPC related op handles that connects its in ops and out ops. // Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
ir::Node *node) const { ir::Node *node) const {
int op_dev_id = -1; int op_dev_id = -1;
if (node->Op()->Type() == "send") { if (node->Op()->Type() == "send") {
// TODO(paddle-dev): getting the first var is not safe. // TODO(paddle-dev): getting the first var is not safe.
@ -824,6 +849,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id); CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
} }
} }
return op_dev_id;
} }
bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {

@ -54,8 +54,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
bool IsScaleLossOp(ir::Node *node) const; bool IsScaleLossOp(ir::Node *node) const;
void CreateRPCOp(ir::Graph *result, ir::Node *node) const; int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
/** /**
* Is this operator as the end-point operator before/after send operator. * Is this operator as the end-point operator before/after send operator.

@ -27,7 +27,8 @@ namespace framework {
namespace details { namespace details {
void ReduceOpHandle::RunImpl() { void ReduceOpHandle::RunImpl() {
platform::RecordEvent r("reduce", nullptr); platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (places_.size() == 1) return; if (places_.size() == 1) return;
// the input and output may have dummy var. // the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_); auto in_var_handles = DynamicCast<VarHandle>(inputs_);

@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
->stream(); ->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp, memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream); platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(1) << place_ << "RUN Scale loss grad op"; VLOG(10) << place_ << "RUN Scale loss grad op";
}); });
#endif #endif
} }

@ -58,7 +58,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in})); desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in})); desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in})); desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
desc.SetOutput("Out", std::vector<std::string>({conv_relu_out})); desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
desc.SetType("conv2d"); desc.SetType("conv2d");
for (auto& attr : conv->Op()->GetAttrMap()) { for (auto& attr : conv->Op()->GetAttrMap()) {
desc.SetAttr(attr.first, attr.second); desc.SetAttr(attr.first, attr.second);

@ -429,7 +429,7 @@ struct LSTM : public PatternBase {
struct GRU : public PatternBase { struct GRU : public PatternBase {
GRU(PDPattern* pattern, const std::string& name_scope) GRU(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "lstm") {} : PatternBase(pattern, name_scope, "gru") {}
PDNode* operator()(PDNode* x); PDNode* operator()(PDNode* x);

@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
"mul_gru_fuse_pass", // "mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", // "seq_concat_fc_fuse_pass", //
"fc_fuse_pass", // "fc_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN
"conv_relu_mkldnn_fuse_pass", //
#endif
}}; }};
std::unordered_set<std::string> disabled_ir_passes_; std::unordered_set<std::string> disabled_ir_passes_;

@ -120,13 +120,20 @@ void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map,
outputs.insert(node); outputs.insert(node);
} }
// update the dst and src node's inlinks and outlinks. // update the dst and src node's inlinks and outlinks.
#ifdef __clang__
src_node->inlinks = std::vector<BriefNode *>(inputs.begin(), inputs.end());
src_node->outlinks = std::vector<BriefNode *>(outputs.begin(), outputs.end());
dst_node->inlinks.clear();
dst_node->outlinks.clear();
#else
src_node->inlinks = src_node->inlinks =
std::move(std::vector<BriefNode *>(inputs.begin(), inputs.end())); std::move(std::vector<BriefNode *>(inputs.begin(), inputs.end()));
src_node->outlinks = src_node->outlinks =
std::move(std::vector<BriefNode *>(outputs.begin(), outputs.end())); std::move(std::vector<BriefNode *>(outputs.begin(), outputs.end()));
dst_node->inlinks.clear(); dst_node->inlinks.clear();
dst_node->outlinks.clear(); dst_node->outlinks.clear();
#endif
auto inlink_or_outlink_cleaner = [&](std::vector<BriefNode *> &nodes) { auto inlink_or_outlink_cleaner = [&](std::vector<BriefNode *> &nodes) {
for (auto *&n : nodes) { for (auto *&n : nodes) {

@ -77,6 +77,9 @@ bool AnalysisPredictor::Init(
OptimizeInferenceProgram(); OptimizeInferenceProgram();
ctx_ = executor_->Prepare(*inference_program_, 0); ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
VLOG(5) << "to create variables"; VLOG(5) << "to create variables";
PADDLE_ENFORCE(scope_.get()); PADDLE_ENFORCE(scope_.get());

@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
@ -64,13 +64,15 @@ PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
void PaddleBuf::Resize(size_t length) { void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed. // Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return; if (length_ >= length) return;
if (memory_owned_) { if (memory_owned_) {
Free(); Free();
data_ = malloc(length);
length_ = length;
memory_owned_ = true;
} else {
PADDLE_THROW("The memory is allocated externally, can not Resized");
} }
data_ = new char[length];
length_ = length;
memory_owned_ = true;
} }
void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Reset(void* data, size_t length) {
@ -82,8 +84,8 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() { void PaddleBuf::Free() {
if (memory_owned_ && data_) { if (memory_owned_ && data_) {
assert(length_ > 0); PADDLE_ENFORCE_GT(length_, 0);
delete[] static_cast<char*>(data_); free(static_cast<char*>(data_));
data_ = nullptr; data_ = nullptr;
length_ = 0; length_ = 0;
} }

@ -106,6 +106,9 @@ bool NativePaddlePredictor::Init(
} }
ctx_ = executor_->Prepare(*inference_program_, 0); ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
executor_->CreateVariables(*inference_program_, executor_->CreateVariables(*inference_program_,
sub_scope_ ? sub_scope_ : scope_.get(), 0); sub_scope_ ? sub_scope_ : scope_.get(), 0);

@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
} }
void PrintTime(int batch_size, int repeat, int num_threads, int tid, void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency) { double latency, int epoch = 1) {
LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid << ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms ======"; << ", latency: " << latency << "ms ======";
if (epoch > 1) {
int samples = batch_size * epoch;
LOG(INFO) << "====== sample number: " << samples
<< ", average latency of each sample: " << latency / samples
<< "ms ======";
}
} }
} // namespace inference } // namespace inference

@ -45,7 +45,7 @@ class PaddleBuf {
PaddleBuf(void* data, size_t length) PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {} : data_(data), length_(length), memory_owned_{false} {}
// Own memory. // Own memory.
PaddleBuf(size_t length) explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {} : data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes. // Resize to `length` bytes.
void Resize(size_t length); void Resize(size_t length);
@ -121,6 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config {
bool use_gpu{false}; bool use_gpu{false};
int device{0}; int device{0};
float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
// NOTE: NOT use it, just for the internal test, will discard later
bool _use_mkldnn{false};
// Specify the variable's name of each input. // Specify the variable's name of each input.
bool specify_input_name{false}; bool specify_input_name{false};

@ -1,57 +1,77 @@
function (inference_download_and_uncompress install_dir url) set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
get_filename_component(filename ${url} NAME) set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo")
message(STATUS "Download inference test stuff ${filename} from ${url}") set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
function (inference_download_and_uncompress install_dir filename)
message(STATUS "Download inference test stuff from ${INFERENCE_URL}/${filename}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}") execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}") execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${INFERENCE_URL}/${filename}")
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
message(STATUS "finish downloading ${filename}") message(STATUS "finish downloading ${filename}")
endfunction(inference_download_and_uncompress) endfunction(inference_download_and_uncompress)
function(download_model_and_data install_dir model_url data_url) function(download_model_and_data install_dir model_name data_name)
if (NOT EXISTS ${install_dir} AND WITH_INFERENCE) if (NOT EXISTS ${install_dir} AND WITH_INFERENCE)
inference_download_and_uncompress(${install_dir} ${model_url}) inference_download_and_uncompress(${install_dir} ${model_name})
inference_download_and_uncompress(${install_dir} ${data_url}) inference_download_and_uncompress(${install_dir} ${data_name})
endif() endif()
endfunction() endfunction()
# RNN1 # RNN1
set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz") # TODO: fix this test on MACOS
set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz") message(WARNING "These tests has been disabled in OSX before being fixed: \n test_analyzer_rnn1")
set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1") if(NOT APPLE)
download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} ${RNN1_DATA_URL}) set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
ARGS --infer_model=${RNN1_INSTALL_DIR}/model EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
--infer_data=${RNN1_INSTALL_DIR}/data.txt) ARGS --infer_model=${RNN1_INSTALL_DIR}/model
--infer_data=${RNN1_INSTALL_DIR}/data.txt)
endif(NOT APPLE)
# RNN2
set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
inference_analysis_test(test_analyzer_rnn2 SRCS analyzer_rnn2_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RNN2_INSTALL_DIR}/model
--infer_data=${RNN2_INSTALL_DIR}/data.txt)
# chinese_ner # chinese_ner
set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner")
download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} ${CHINESE_NER_DATA_URL})
inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
# lac # lac
set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac")
download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} ${LAC_DATA_URL})
inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${LAC_INSTALL_DIR}/model ARGS --infer_model=${LAC_INSTALL_DIR}/model
--infer_data=${LAC_INSTALL_DIR}/data.txt) --infer_data=${LAC_INSTALL_DIR}/data.txt)
# text_classification # text_classification
set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz") set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz") download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification") inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL}) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/model
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt)
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
--infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt # ocr
--topn=1 # Just run top 1 batch. set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz")
) set(OCR_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE)
get_filename_component(filename ${OCR_MODEL_URL} NAME)
message(STATUS "Download inference test stuff ${filename} from ${OCR_MODEL_URL}")
execute_process(COMMAND bash -c "mkdir -p ${OCR_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && wget -q ${OCR_MODEL_URL}")
execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && tar xzf ${filename}")
message(STATUS "finish downloading ${filename}")
endif()
inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${OCR_INSTALL_DIR}/model
--infer_data=${OCR_INSTALL_DIR}/data.txt)

@ -12,21 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path for LAC");
DEFINE_string(infer_data, "", "data file for LAC");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(burning, 0, "Burning before repeat.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
@ -124,48 +110,38 @@ const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
void TestLACPrediction(const std::string &model_path, void TestLACPrediction(const std::string &model_path,
const std::string &data_file, const int batch_size, const std::string &data_file, const int batch_size,
const int repeat, bool test_all_data, const int repeat, bool use_analysis = false) {
bool use_analysis = false) { AnalysisConfig cfg;
NativeConfig config; cfg.model_dir = model_path;
config.model_dir = model_path; cfg.use_gpu = false;
config.use_gpu = false; cfg.device = 0;
config.device = 0; cfg.specify_input_name = true;
config.specify_input_name = true; cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs_slots; std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size); DataRecord data(data_file, batch_size);
GetOneBatch(&input_slots, &data, batch_size); GetOneBatch(&input_slots, &data, batch_size);
std::unique_ptr<PaddlePredictor> predictor; std::unique_ptr<PaddlePredictor> predictor;
if (use_analysis) { if (use_analysis) {
AnalysisConfig cfg;
cfg.model_dir = model_path;
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
predictor = predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else { } else {
predictor = predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
} }
for (int i = 0; i < FLAGS_burning; i++) { for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots); predictor->Run(input_slots, &outputs_slots);
} }
Timer timer; Timer timer;
if (test_all_data) { if (FLAGS_test_all_data) {
double sum = 0; LOG(INFO) << "test all data";
LOG(INFO) << "Total number of samples: " << data.datasets.size(); std::vector<std::vector<PaddleTensor>> input_slots_all;
for (int i = 0; i < repeat; i++) { for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { GetOneBatch(&input_slots, &data, batch_size);
GetOneBatch(&input_slots, &data, batch_size); input_slots_all.emplace_back(input_slots);
timer.tic();
predictor->Run(input_slots, &outputs_slots);
sum += timer.toc();
}
} }
PrintTime(batch_size, repeat, 1, 0, sum / repeat); LOG(INFO) << "total number of samples: " << data.datasets.size();
LOG(INFO) << "Average latency of each sample: " TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
<< sum / repeat / data.datasets.size() << " ms";
return; return;
} }
timer.tic(); timer.tic();
@ -190,19 +166,10 @@ void TestLACPrediction(const std::string &model_path,
if (use_analysis) { if (use_analysis) {
// run once for comparion as reference // run once for comparion as reference
auto ref_predictor = auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots; std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots); ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size()); CompareResult(ref_outputs_slots, outputs_slots);
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], pdata[i]);
}
AnalysisPredictor *analysis_predictor = AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get()); dynamic_cast<AnalysisPredictor *>(predictor.get());
@ -231,13 +198,13 @@ void TestLACPrediction(const std::string &model_path,
TEST(Analyzer_LAC, native) { TEST(Analyzer_LAC, native) {
LOG(INFO) << "LAC with native"; LOG(INFO) << "LAC with native";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data); FLAGS_repeat);
} }
TEST(Analyzer_LAC, analysis) { TEST(Analyzer_LAC, analysis) {
LOG(INFO) << "LAC with analysis"; LOG(INFO) << "LAC with analysis";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data, true); FLAGS_repeat, true);
} }
} // namespace analysis } // namespace analysis

@ -12,20 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
@ -113,50 +100,35 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
48, 39, 38, 16, 25}; 48, 39, 38, 16, 25};
void TestChineseNERPrediction(bool use_analysis) { void TestChineseNERPrediction(bool use_analysis) {
NativeConfig config; AnalysisConfig cfg;
config.prog_file = FLAGS_infer_model + "/__model__"; cfg.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param"; cfg.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false; cfg.use_gpu = false;
config.device = 0; cfg.device = 0;
config.specify_input_name = true; cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs; std::vector<PaddleTensor> input_slots, outputs;
std::unique_ptr<PaddlePredictor> predictor; std::unique_ptr<PaddlePredictor> predictor;
Timer timer; Timer timer;
if (use_analysis) { if (use_analysis) {
AnalysisConfig cfg;
cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.param_file = FLAGS_infer_model + "/param";
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
predictor = predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else { } else {
predictor = predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
} }
if (FLAGS_test_all_data) { if (FLAGS_test_all_data) {
LOG(INFO) << "test all data"; LOG(INFO) << "test all data";
double sum = 0; DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
size_t num_samples; std::vector<std::vector<PaddleTensor>> input_slots_all;
for (int i = 0; i < FLAGS_repeat; i++) { for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size); PrepareInputs(&input_slots, &data, FLAGS_batch_size);
// Just one batch, the num_samples remains the same. input_slots_all.emplace_back(input_slots);
num_samples = data.num_samples;
for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
timer.tic();
predictor->Run(input_slots, &outputs);
sum += timer.toc();
}
} }
LOG(INFO) << "total number of samples: " << num_samples; LOG(INFO) << "total number of samples: " << data.num_samples;
PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat); TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
LOG(INFO) << "average latency of each sample: "
<< sum / FLAGS_repeat / num_samples;
return; return;
} }
// Prepare inputs. // Prepare inputs.
@ -182,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) {
if (use_analysis) { if (use_analysis) {
// run once for comparion as reference // run once for comparion as reference
auto ref_predictor = auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots; std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots); ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs.size()); CompareResult(ref_outputs_slots, outputs);
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], result[i]);
}
AnalysisPredictor *analysis_predictor = AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get()); dynamic_cast<AnalysisPredictor *>(predictor.get());

@ -12,24 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
@ -164,26 +147,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &base_outputs) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
}
}
}
// Test with a really complicate model. // Test with a really complicate model.
void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
AnalysisConfig config; AnalysisConfig config;
@ -198,7 +161,6 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
config.ir_passes.clear(); // Do not exclude any pass. config.ir_passes.clear(); // Do not exclude any pass.
int batch_size = FLAGS_batch_size; int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor = auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
@ -213,45 +175,14 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
base_predictor->Run(input_slots, &base_outputs); base_predictor->Run(input_slots, &base_outputs);
std::vector<std::vector<PaddleTensor>> input_slots_all;
input_slots_all.emplace_back(input_slots);
if (num_threads == 1) { if (num_threads == 1) {
// Prepare inputs. TestOneThreadPrediction(config, input_slots_all, &outputs);
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
CompareResult(outputs, base_outputs); CompareResult(outputs, base_outputs);
} else { } else {
std::vector<std::thread> threads; // only return the output of first thread
std::vector<std::unique_ptr<PaddlePredictor>> predictors; TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
// TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
// because AttentionLSTM's hard code nodeid will be damanged.
for (int tid = 0; tid < num_threads; ++tid) {
predictors.emplace_back(
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config));
}
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
// Each thread should have local input_slots and outputs.
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size);
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictors[tid]->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, num_threads, tid,
timer.toc() / num_times);
CompareResult(outputs, base_outputs);
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
}
} }
if (use_analysis && activate_ir) { if (use_analysis && activate_ir) {
@ -293,8 +224,7 @@ TEST(Analyzer, RNN_tests) {
// Directly infer with the original model. // Directly infer with the original model.
TestRNN1Prediction(false, false, i); TestRNN1Prediction(false, false, i);
// Inference with the original model with the analysis turned on, the // Inference with the original model with the analysis turned on, the
// analysis // analysis module will transform the program to a data flow graph.
// module will transform the program to a data flow graph.
TestRNN1Prediction(true, false, i); TestRNN1Prediction(true, false, i);
// Inference with analysis and IR. The IR module will fuse some large // Inference with analysis and IR. The IR module will fuse some large
// kernels. // kernels.

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save