From d83187dba87bf106abb71ed559f645cc79a7933a Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Tue, 4 Sep 2018 11:07:43 +0800
Subject: [PATCH 01/13] enable lac analysis test

---
 .../fluid/inference/analysis/CMakeLists.txt   | 14 +++-
 .../inference/analysis/analyzer_lac_tester.cc | 70 ++++++++++++++++---
 paddle/fluid/inference/api/CMakeLists.txt     |  2 +-
 3 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index cc0dd0d492..eb4908da24 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -105,6 +105,18 @@ if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING)
 endif()

 inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
-  EXTRA_DEPS paddle_inference_api paddle_fluid_api
+  EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
+  analysis_predictor
+  # ir
+  fc_fuse_pass
+  fc_lstm_fuse_pass
+  seq_concat_fc_fuse_pass
+  graph_viz_pass
+  infer_clean_graph_pass
+  graph_pattern_detector
+  infer_clean_graph_pass
+  attention_lstm_fuse_pass
+  paddle_inference_api
+  pass
   ARGS --infer_model=${LAC_INSTALL_DIR}/model
        --infer_data=${LAC_INSTALL_DIR}/data.txt)
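The tester diff that follows feeds the LAC model LoD (level-of-detail) sequence data, where a LoD vector holds cumulative sequence offsets; that is what lets GetOneBatch check the batch size against lod.size() - 1. A self-contained sketch of that invariant (illustrative only, not part of the patch):

#include <cassert>
#include <vector>

// A level-0 LoD stores cumulative offsets: {0, 4, 10} means two sequences,
// words [0, 4) and [4, 10), packed back to back in one tensor.
int BatchSizeFromLoD(const std::vector<size_t> &lod) {
  return static_cast<int>(lod.size()) - 1;
}

int main() {
  std::vector<size_t> lod = {0, 4, 10};
  assert(BatchSizeFromLoD(lod) == 2);  // the same check GetOneBatch enforces
  return 0;
}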
diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index e2f7253ac0..2aef25603f 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include "paddle/fluid/inference/analysis/analyzer.h"
-#include 
 #include 
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
@@ -102,6 +102,7 @@ struct DataRecord {
     return data;
   }
 };
+
 void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                  int batch_size) {
   auto one_batch = data->NextBatch();
@@ -114,12 +115,14 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
   input_slots->assign({input_tensor});
 }
+
 static void PrintTime(const double latency, const int bs, const int repeat) {
   LOG(INFO) << "===========profile result===========";
   LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
             << ", avg latency: " << latency / repeat << "ms";
   LOG(INFO) << "=====================================";
 }
+
 void BenchAllData(const std::string &model_path, const std::string &data_file,
                   const int batch_size, const int repeat) {
   NativeConfig config;
@@ -147,36 +150,64 @@ void BenchAllData(const std::string &model_path, const std::string &data_file,
   }
   PrintTime(sum, batch_size, repeat);
 }
+
 const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
                                 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
                                 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
                                 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
+
 void TestLACPrediction(const std::string &model_path,
                        const std::string &data_file, const int batch_size,
-                       const int repeat, bool test_all_data) {
-  if (test_all_data) {
-    BenchAllData(model_path, data_file, batch_size, repeat);
-    return;
-  }
+                       const int repeat, bool test_all_data,
+                       bool use_analysis = false) {
   NativeConfig config;
   config.model_dir = model_path;
   config.use_gpu = false;
   config.device = 0;
   config.specify_input_name = true;
-  std::vector<PaddleTensor> input_slots, outputs_slots;
+  std::vector<PaddleTensor> input_slots, outputs_slots, ref_outputs_slots;
   DataRecord data(data_file, batch_size);
   GetOneBatch(&input_slots, &data, batch_size);
-  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  std::unique_ptr<PaddlePredictor> predictor;
+  if (use_analysis) {
+    predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(
+            config);
+  } else {
+    predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  }
   for (int i = 0; i < FLAGS_burning; i++) {
     predictor->Run(input_slots, &outputs_slots);
   }
   Timer timer;
+  if (test_all_data) {
+    double sum = 0;
+    for (int i = 0; i < repeat; i++) {
+      for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
+        GetOneBatch(&input_slots, &data, batch_size);
+        timer.tic();
+        predictor->Run(input_slots, &outputs_slots);
+        sum += timer.toc();
+      }
+    }
+    PrintTime(sum, batch_size, repeat);
+    return;
+  }
   timer.tic();
   for (int i = 0; i < repeat; i++) {
     predictor->Run(input_slots, &outputs_slots);
  }
   PrintTime(timer.toc(), batch_size, repeat);
+
+  // check result
+  if (use_analysis) {
+    // run once for comparison as reference
+    auto ref_predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    ref_predictor->Run(input_slots, &ref_outputs_slots);
+  }
+
   EXPECT_EQ(outputs_slots.size(), 1UL);
   auto &out = outputs_slots[0];
   size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
@@ -188,12 +219,33 @@ void TestLACPrediction(const std::string &model_path,
   for (size_t i = 0; i < batch1_size; ++i) {
     EXPECT_EQ(pdata[i], lac_ref_data[i]);
   }
+
+  if (use_analysis) {
+    EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
+    auto &ref_out = ref_outputs_slots[0];
+    size_t ref_size =
+        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
+                        [](int a, int b) { return a * b; });
+    EXPECT_EQ(size, ref_size);
+    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+    for (size_t i = 0; i < size; ++i) {
+      EXPECT_EQ(pdata_ref[i], pdata[i]);
+    }
+  }
 }
+
 TEST(Analyzer_LAC, native) {
   LOG(INFO) << "LAC with native";
   TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
                     FLAGS_repeat, FLAGS_test_all_data);
 }
+
+TEST(Analyzer_LAC, analysis) {
+  LOG(INFO) << "LAC with analysis";
+  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
+                    FLAGS_repeat, FLAGS_test_all_data, true);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index adfe439244..a94c79a698 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -47,7 +47,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)

 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api)
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)

 cc_test(test_paddle_inference_api
     SRCS api_tester.cc
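Patch 01 works entirely through the shared PaddlePredictor interface, so the native and analysis engines are swappable at the call site. A hedged sketch of the caller-side pattern the tester relies on; the model path and input name are placeholders, not values taken from the patch:

#include <cstdint>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::NativeConfig config;
  config.model_dir = "/path/to/lac/model";  // placeholder
  config.use_gpu = false;
  config.device = 0;
  config.specify_input_name = true;

  auto predictor = paddle::CreatePaddlePredictor<
      paddle::NativeConfig, paddle::PaddleEngineKind::kNative>(config);

  std::vector<int64_t> ids = {24, 25, 38};  // word ids for one sequence
  paddle::PaddleTensor input;
  input.name = "word";  // assumed input name
  input.shape = {static_cast<int>(ids.size()), 1};
  input.lod = {{0, ids.size()}};  // a single sequence covering all ids
  input.dtype = paddle::PaddleDType::INT64;
  input.data = paddle::PaddleBuf(ids.data(), ids.size() * sizeof(int64_t));

  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({input}, &outputs);
  return 0;
}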
From 09016df8df61cff85a58c0dfd5a29e4feb575a97 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Wed, 5 Sep 2018 21:03:53 +0800
Subject: [PATCH 02/13] make analyzer run

---
 paddle/fluid/inference/analysis/CMakeLists.txt     | 14 +-------------
 .../inference/analysis/analyzer_lac_tester.cc      | 10 ++++++++--
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 43201fb0bb..dce74ee3f9 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -93,19 +93,7 @@ if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
 endif()

 inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
-  EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
-  analysis_predictor
-  # ir
-  fc_fuse_pass
-  fc_lstm_fuse_pass
-  seq_concat_fc_fuse_pass
-  graph_viz_pass
-  infer_clean_graph_pass
-  graph_pattern_detector
-  infer_clean_graph_pass
-  attention_lstm_fuse_pass
-  paddle_inference_api
-  pass
+  EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
   ARGS --infer_model=${LAC_INSTALL_DIR}/model
        --infer_data=${LAC_INSTALL_DIR}/data.txt)

diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index 2aef25603f..5efee95030 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -16,6 +16,7 @@
 #include 
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -170,9 +171,14 @@ void TestLACPrediction(const std::string &model_path,
   GetOneBatch(&input_slots, &data, batch_size);
   std::unique_ptr<PaddlePredictor> predictor;
   if (use_analysis) {
+    AnalysisConfig cfg;
+    cfg.model_dir = model_path;
+    cfg.use_gpu = false;
+    cfg.device = 0;
+    cfg.specify_input_name = true;
+    cfg.enable_ir_optim = true;
     predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(
-            config);
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
   } else {
     predictor =
         CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
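At this point in the series, AnalysisConfig simply extends NativeConfig with IR switches, which is why the hunk above only needs to set the extra enable_ir_optim field. A minimal sketch of the analysis-side construction, under that assumption:

#include <memory>
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Build an analysis predictor; all NativeConfig fields carry over.
std::unique_ptr<paddle::PaddlePredictor> MakeAnalysisPredictor(
    const std::string &model_dir) {
  paddle::AnalysisConfig cfg;
  cfg.model_dir = model_dir;
  cfg.use_gpu = false;
  cfg.device = 0;
  cfg.specify_input_name = true;
  cfg.enable_ir_optim = true;  // turn on the Analyzer's IR fuse passes
  return paddle::CreatePaddlePredictor<
      paddle::AnalysisConfig, paddle::PaddleEngineKind::kAnalysis>(cfg);
}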
+ +#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::string GenNodeName(const std::string& prefix, const std::string& name) { + return prefix + "/" + name; +} + +void BuildPattern(PDPattern* pattern, const std::string& name_scope, + bool with_fc_bias) { + PDNode* x = pattern->NewNode(name_scope, "x") + ->assert_is_op_input("mul") + ->assert_var_not_persistable(); + auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias); + fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse. + patterns::GRU(pattern, name_scope, fc_out); + VLOG(3) << "\n" << pattern->DotString(); +} + +int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, + bool with_fc_bias) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + BuildPattern(pattern, name_scope, with_fc_bias); + + // Create New OpDesc + auto gru_creater = [&](int gru, int x, int weight_x, int weight_h, int bias, + int hidden, int fc_bias) { +#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x); + GET_NODE(x); + GET_NODE(weight_x); + GET_NODE(weight_h); + GET_NODE(bias); + GET_NODE(hidden); + GET_NODE(gru); + + OpDesc op_desc; + op_desc.SetType("fusion_gru"); +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()}); + SET_IN(X, x); + SET_IN(WeightX, weight_x); + SET_IN(WeightH, weight_h); + SET_IN(Bias, bias); +#undef SET_IN + if (with_fc_bias) { + // Add FC-bias with LSTM-bias and create a new weight + PADDLE_ENFORCE(scope); + const std::string& new_bias_var = name_scope + "_bias.new"; + auto* bias_var = scope->Var(new_bias_var); + PADDLE_ENFORCE(bias_var); + auto* bias_tensor = bias_var->GetMutable(); + auto* gru_bias_var = scope->FindVar(bias_n->Name()); + PADDLE_ENFORCE(gru_bias_var); + const auto& gru_bias_tenosr = gru_bias_var->Get(); + bias_tensor->Resize(gru_bias_tenosr.dims()); + + GET_NODE(fc_bias); + auto* fc_bias_var = scope->FindVar(fc_bias_n->Name()); + const auto& fc_bias_tensor = fc_bias_var->Get(); + // new bias = fc bias + gru bias + auto* data = bias_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < bias_tensor->numel(); i++) { + data[i] = + fc_bias_tensor.data()[i] + gru_bias_tenosr.data()[i]; + } + op_desc.SetInput("Bias", {new_bias_var}); + } +#undef GET_NODE + + op_desc.SetInput("H0", {}); + op_desc.SetOutput("Hidden", {hidden_n->Name()}); + op_desc.SetAttr("is_reverse", gru_n->Op()->GetAttr("is_reverse")); + // TODO(TJ): This should be a option for infer + op_desc.SetAttr("use_seq", true); + + // Create temp variables. + // TODO(TJ): clean code + scope->Var(name_scope + "/ReorderedH0.new") + ->GetMutable(); + scope->Var(name_scope + "/XX.new")->GetMutable(); + scope->Var(name_scope + "/BatchedInput.new") + ->GetMutable(); + scope->Var(name_scope + "/BatchedOut.new") + ->GetMutable(); + op_desc.SetOutput("ReorderedH0", {name_scope + "/ReorderedH0.new"}); + op_desc.SetOutput("XX", {name_scope + "/XX.new"}); + op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"}); + op_desc.SetOutput("BatchedOut", {name_scope + "/BatchedOut.new"}); + + auto* op = graph->CreateOpNode(&op_desc); + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); + + IR_NODE_LINK_TO(x_n, op); + IR_NODE_LINK_TO(weight_x_n, op); + IR_NODE_LINK_TO(weight_h_n, op); + IR_NODE_LINK_TO(bias_n, op); + IR_NODE_LINK_TO(op, hidden_n); + // h0? 
+    return op;
+  };
+
+  int fusion_count{0};
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+#define GET_NODE(name__)                                \
+  std::string name__##key = name_scope + "/" + #name__; \
+  auto* name__##n = pattern->RetrieveNode(name__##key); \
+  PADDLE_ENFORCE(name__##n);                            \
+  PADDLE_ENFORCE(subgraph.count(name__##n));            \
+  Node* name__##_n = subgraph.at(name__##n);            \
+  int name__ __attribute__((unused)) = name__##_n->id();
+
+    GET_NODE(x);
+    GET_NODE(w);
+    GET_NODE(mul);
+    GET_NODE(fc_out);
+    GET_NODE(Weight);
+    GET_NODE(gru);
+    GET_NODE(Bias);
+    GET_NODE(Hidden);
+
+    if (with_fc_bias) {
+      GET_NODE(fc_bias);
+      GET_NODE(elementwise_add);
+      gru_creater(gru, x, w, Weight, Bias, Hidden, fc_bias);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes(
+          {mul_n, gru_n, elementwise_add_n});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    } else {
+      gru_creater(gru, x, w, Weight, Bias, Hidden, -1);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes({mul_n, gru_n});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    }
+#undef GET_NODE
+
+    ++fusion_count;
+  };
+
+  gpd(graph, handler);
+
+  return fusion_count;
+}
+
+std::unique_ptr<ir::Graph> MulGRUFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 false /*with_fc_bias*/);
+
+  AddStatis(fusion_count);
+  return graph;
+}
+
+std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 true /*with_fc_bias*/);
+
+  AddStatis(fusion_count);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulGRUFusePass);
+REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCGRUFusePass);
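The bias handling in gru_creater rests on simple arithmetic: gru(x·Wx + b_fc) adds b_fc and the GRU's own bias to the same pre-activation, so the two vectors can be summed once offline. The folding in isolation (a sketch; the pass performs this element-wise over the LoDTensor data):

#include <cstddef>
#include <vector>

// fused_bias[i] = fc_bias[i] + gru_bias[i], exactly what the loop in the
// pass computes before wiring the new "Bias" input of fusion_gru.
std::vector<float> FoldBias(const std::vector<float> &fc_bias,
                            const std::vector<float> &gru_bias) {
  std::vector<float> fused(gru_bias.size());
  for (size_t i = 0; i < fused.size(); ++i) {
    fused[i] = fc_bias[i] + gru_bias[i];
  }
  return fused;
}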
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
new file mode 100644
index 0000000000..63e1c72bfb
--- /dev/null
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include 
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// The MulGRUFusePass and FCGRUFusePass will fuse to the same FusionGRU op.
+
+class FCGRUFusePass : public FusePassBase {
+ public:
+  virtual ~FCGRUFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  const std::string name_scope_{"fc_gru_fuse"};
+};
+
+// Just FC without bias
+class MulGRUFusePass : public FusePassBase {
+ public:
+  virtual ~MulGRUFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"fc_nobias_gru_fuse"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 434bee4cce..8dfe36f781 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -565,6 +565,7 @@ PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
   return fc_out;
 }
+
 PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
                        PDNode* x) {
   x->assert_is_op_input("lstm", "Input");
@@ -589,6 +590,32 @@ PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
   lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
   return Hidden;
 }
+
+PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope,
+                      PDNode* x) {
+  x->assert_is_op_input("gru", "Input");
+  auto* gru_op = pattern->NewNode(name_scope, "gru")->assert_is_op("gru");
+#define NEW_NODE(arg__, io__)                        \
+  auto* arg__ = pattern->NewNode(name_scope, #arg__) \
+                    ->assert_is_op_##io__("gru", #arg__);
+
+  NEW_NODE(Weight, input);
+  // TODO(Superjomn): upgrade the fuse framework to support optional.
+  // H0 and bias are optional
+  NEW_NODE(Bias, input);  // also optional
+  // NEW_NODE(H0, input);
+
+  NEW_NODE(Hidden, output);
+  // below are intermediate
+  NEW_NODE(BatchGate, output);
+  NEW_NODE(BatchResetHiddenPrev, output);
+  NEW_NODE(BatchHidden, output);
+
+  gru_op->LinksFrom({x, Weight, Bias});
+  gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden});
+  return Hidden;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index eacea1750f..71e4c36d9b 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -298,6 +298,8 @@ PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x,

 PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);

+PDNode* GRU(PDPattern* pattern, const std::string& name_scope, PDNode* x);
+
 }  // namespace patterns

 #define IR_NODE_LINK_TO(a, b) \

From 74f95b8da05a6a7f7487222b8f004f40a3156c05 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Thu, 6 Sep 2018 11:09:23 +0800
Subject: [PATCH 04/13] fix redefine macro

---
 .../framework/ir/graph_pattern_detector.cc   | 35 +++++++++----------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 8dfe36f781..8b1e653ec8 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -566,25 +566,26 @@ PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
   return fc_out;
 }

+#define NEW_NODE(op__, arg__, io__)                  \
+  auto* arg__ = pattern->NewNode(name_scope, #arg__) \
+                    ->assert_is_op_##io__(#op__, #arg__);
+
 PDNode* patterns::LSTM(PDPattern* pattern, const std::string&
name_scope, PDNode* x) { x->assert_is_op_input("lstm", "Input"); auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm"); -#define NEW_NODE(arg__, io__) \ - auto* arg__ = pattern->NewNode(name_scope, #arg__) \ - ->assert_is_op_##io__("lstm", #arg__); // Currently, the H0 and C0 are optional // TODO(Superjomn) upgrade the fuse framework to support optional. // NEW_NODE(H0, input); // NEW_NODE(C0, input); - NEW_NODE(Weight, input); - NEW_NODE(Bias, input); + NEW_NODE(lstm, Weight, input); + NEW_NODE(lstm, Bias, input); - NEW_NODE(Hidden, output); - NEW_NODE(Cell, output); - NEW_NODE(BatchGate, output); - NEW_NODE(BatchCellPreAct, output); + NEW_NODE(lstm, Hidden, output); + NEW_NODE(lstm, Cell, output); + NEW_NODE(lstm, BatchGate, output); + NEW_NODE(lstm, BatchCellPreAct, output); lstm_op->LinksFrom({x, Weight, Bias}); lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct}); @@ -595,26 +596,24 @@ PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope, PDNode* x) { x->assert_is_op_input("gru", "Input"); auto* gru_op = pattern->NewNode(name_scope, "gru")->assert_is_op("gru"); -#define NEW_NODE(arg__, io__) \ - auto* arg__ = pattern->NewNode(name_scope, #arg__) \ - ->assert_is_op_##io__("gru", #arg__); - NEW_NODE(Weight, input); + NEW_NODE(gru, Weight, input); // TODO(Superjomn): upgrade the fuse framework to support optional. // H0 and bias are optional - NEW_NODE(Bias, input); // also optional + NEW_NODE(gru, Bias, input); // also optional // NEW_NODE(H0, input); - NEW_NODE(Hidden, output); + NEW_NODE(gru, Hidden, output); // below are intermediate - NEW_NODE(BatchGate, output); - NEW_NODE(BatchResetHiddenPrev, output); - NEW_NODE(BatchHidden, output); + NEW_NODE(gru, BatchGate, output); + NEW_NODE(gru, BatchResetHiddenPrev, output); + NEW_NODE(gru, BatchHidden, output); gru_op->LinksFrom({x, Weight, Bias}); gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden}); return Hidden; } +#undef NEW_NODE } // namespace ir } // namespace framework From 4d774953c6cb584f084129746b4d2aea0e59237a Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 6 Sep 2018 11:53:25 +0800 Subject: [PATCH 05/13] enable fc gru fuse pass --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + paddle/fluid/framework/ir/fc_gru_fuse_pass.cc | 18 ++++++------- .../fluid/framework/ir/fc_lstm_fuse_pass.cc | 11 ++++---- paddle/fluid/inference/analysis/analyzer.h | 4 +++ .../inference/analysis/analyzer_lac_tester.cc | 25 +++++++++++++++++++ paddle/fluid/inference/api/CMakeLists.txt | 1 + 6 files changed, 44 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index f5235f70ad..6c7f972589 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -24,6 +24,7 @@ pass_library(fc_fuse_pass) pass_library(attention_lstm_fuse_pass) pass_library(infer_clean_graph_pass) pass_library(fc_lstm_fuse_pass) +pass_library(fc_gru_fuse_pass) pass_library(seq_concat_fc_fuse_pass) set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index 1e7b49620c..4a08beee7d 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -20,12 +20,8 @@ namespace paddle { namespace framework { namespace ir { -std::string GenNodeName(const std::string& prefix, const std::string& name) { - return prefix + "/" + name; -} - 
-void BuildPattern(PDPattern* pattern, const std::string& name_scope,
-                  bool with_fc_bias) {
+static void BuildPattern(PDPattern* pattern, const std::string& name_scope,
+                         bool with_fc_bias) {
   PDNode* x = pattern->NewNode(name_scope, "x")
                   ->assert_is_op_input("mul")
                   ->assert_var_not_persistable();
@@ -35,8 +31,8 @@ void BuildPattern(PDPattern* pattern, const std::string& name_scope,
   VLOG(3) << "\n" << pattern->DotString();
 }

-int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
-                bool with_fc_bias) {
+static int BuildFusion(Graph* graph, const std::string& name_scope,
+                       Scope* scope, bool with_fc_bias) {
   GraphPatternDetector gpd;
   auto* pattern = gpd.mutable_pattern();
@@ -108,7 +104,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
     auto* op = graph->CreateOpNode(&op_desc);
     PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+    // auto* scope = graph->Get<Scope*>(kParamScopeAttr);

     IR_NODE_LINK_TO(x_n, op);
     IR_NODE_LINK_TO(weight_x_n, op);
@@ -189,5 +185,5 @@ std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
 }  // namespace framework
 }  // namespace paddle

-REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulGRUFusePass);
-REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCGRUFusePass);
+REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
+REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index 0d69dfa79a..5fa3fcb9dc 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -19,12 +19,13 @@ namespace paddle {
 namespace framework {
 namespace ir {

-std::string GenNodeName(const std::string& prefix, const std::string& name) {
+static std::string GenNodeName(const std::string& prefix,
+                               const std::string& name) {
   return prefix + "/" + name;
 }

-void BuildPattern(PDPattern* pattern, const std::string& name_scope,
-                  bool with_fc_bias) {
+static void BuildPattern(PDPattern* pattern, const std::string& name_scope,
+                         bool with_fc_bias) {
   PDNode* x = pattern->NewNode(name_scope, "x")
                   ->assert_is_op_input("mul")
                   ->assert_var_not_persistable();
@@ -34,8 +35,8 @@ void BuildPattern(PDPattern* pattern, const std::string& name_scope,
   // LOG(INFO) << "\n" << pattern->DotString();
 }

-int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
-                bool with_fc_bias) {
+static int BuildFusion(Graph* graph, const std::string& name_scope,
+                       Scope* scope, bool with_fc_bias) {
   GraphPatternDetector gpd;
   auto* pattern = gpd.mutable_pattern();
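The static qualifiers added above are not cosmetic: fc_gru_fuse_pass.cc and fc_lstm_fuse_pass.cc both defined helpers named GenNodeName, BuildPattern, and BuildFusion with external linkage, so linking both objects into one binary produced duplicate symbols. A compressed illustration of the fix (two hypothetical translation units, not the actual files):

// a.cc -- internal linkage: the symbol never leaves this object file.
static int BuildFusion() { return 1; }

// b.cc -- an identically named helper no longer collides at link time.
static int BuildFusion() { return 2; }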
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 3fdd2b9ec7..7800fc90b1 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -36,6 +36,8 @@ limitations under the License. */
 */
 #include 
+#include 
+#include 
 #include "paddle/fluid/inference/analysis/flags.h"
 #include "paddle/fluid/inference/analysis/pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
@@ -66,6 +68,8 @@ class Analyzer : public OrderedRegistry {
       "attention_lstm_fuse_pass",  //
       "fc_lstm_fuse_pass",         //
       "mul_lstm_fuse_pass",        //
+      "fc_gru_fuse_pass",          //
+      "mul_gru_fuse_pass",         //
       "seq_concat_fc_fuse_pass",   //
       "fc_fuse_pass",              //
   }};
diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index 5efee95030..a6e8351c4f 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include 
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -237,6 +238,30 @@ void TestLACPrediction(const std::string &model_path,
     for (size_t i = 0; i < size; ++i) {
       EXPECT_EQ(pdata_ref[i], pdata[i]);
     }
+
+    AnalysisPredictor *analysis_predictor =
+        dynamic_cast<AnalysisPredictor *>(predictor.get());
+    auto &fuse_statis = analysis_predictor->analysis_argument()
+                            .Get<std::unordered_map<std::string, int>>(
+                                framework::ir::kFuseStatisAttr);
+    for (auto &item : fuse_statis) {
+      LOG(INFO) << "fused " << item.first << " " << item.second;
+    }
+    int num_ops = 0;
+    for (auto &node :
+         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+      if (node->IsFunction()) {
+        ++num_ops;
+      }
+    }
+    LOG(INFO) << "has num ops: " << num_ops;
+    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+    LOG(INFO) << "fc fuse num:" << fuse_statis.at("fc_fuse");
+    LOG(INFO) << "fc gru fuse num:" << fuse_statis.at("fc_gru_fuse");
+
+    // ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+    // LOG(INFO) << fuse_statis.at("fc_gru_fuse");
   }
 }
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index e976b9397d..330ea04495 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -50,6 +50,7 @@ cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_a
     pass
     fc_fuse_pass
     fc_lstm_fuse_pass
+    fc_gru_fuse_pass
     seq_concat_fc_fuse_pass
     graph_viz_pass
     infer_clean_graph_pass
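How a name added to all_ir_passes becomes a running pass: REGISTER_PASS records a factory in a global registry, and the IR pass manager looks passes up by string. A hedged sketch of that lookup (the exact registry call shape is an assumption based on framework/ir/pass.h):

#include <memory>
#include <string>
#include <utility>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

std::unique_ptr<paddle::framework::ir::Graph> ApplyPassByName(
    const std::string &name,
    std::unique_ptr<paddle::framework::ir::Graph> graph) {
  // e.g. name == "fc_gru_fuse_pass", registered by REGISTER_PASS above
  auto pass = paddle::framework::ir::PassRegistry::Instance().Get(name);
  return pass->Apply(std::move(graph));
}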
From 6b104c90d353409c2aacd34321bc6cf5407eb0e5 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Thu, 6 Sep 2018 13:51:34 +0800
Subject: [PATCH 06/13] fix profile

---
 .../inference/analysis/analyzer_lac_tester.cc | 19 +++++++------------
 .../fluid/inference/api/analysis_predictor.cc | 13 +++++++++++++
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index a6e8351c4f..1df1ade25f 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -178,6 +178,7 @@ void TestLACPrediction(const std::string &model_path,
     cfg.device = 0;
     cfg.specify_input_name = true;
     cfg.enable_ir_optim = true;
+    cfg.ir_passes.push_back("fc_gru_fuse_pass");
     predictor =
         CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
   } else {
@@ -208,13 +209,6 @@ void TestLACPrediction(const std::string &model_path,
   PrintTime(timer.toc(), batch_size, repeat);

   // check result
-  if (use_analysis) {
-    // run once for comparison as reference
-    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-    ref_predictor->Run(input_slots, &ref_outputs_slots);
-  }
-
   EXPECT_EQ(outputs_slots.size(), 1UL);
   auto &out = outputs_slots[0];
   size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
@@ -228,6 +222,10 @@ void TestLACPrediction(const std::string &model_path,
   }

   if (use_analysis) {
+    // run once for comparison as reference
+    auto ref_predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    ref_predictor->Run(input_slots, &ref_outputs_slots);
     EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
     auto &ref_out = ref_outputs_slots[0];
     size_t ref_size =
@@ -256,12 +254,9 @@ void TestLACPrediction(const std::string &model_path,
   }
   LOG(INFO) << "has num ops: " << num_ops;
   ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-  LOG(INFO) << "fc fuse num:" << fuse_statis.at("fc_fuse");
-  LOG(INFO) << "fc gru fuse num:" << fuse_statis.at("fc_gru_fuse");
+
-  // ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-  // LOG(INFO) << fuse_statis.at("fc_gru_fuse");
+  LOG(INFO) << "fc fuse num:" << fuse_statis.at("fc_fuse");
+  // LOG(INFO) << "fc gru fuse num:" << fuse_statis.at("fc_gru_fuse");
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index a8fa677202..82d673fd15 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -22,12 +22,25 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DECLARE_bool(profile);

 namespace paddle {

 bool AnalysisPredictor::Init(
     const std::shared_ptr<framework::Scope>& parent_scope) {
   VLOG(3) << "Predictor::init()";
+#if !defined(_WIN32)
+  if (FLAGS_profile) {
+    LOG(WARNING) << "Profiler is activated, which might affect the performance";
+    LOG(INFO) << "You can turn it off by setting the gflag '-profile false'";
+    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
+                                           : platform::ProfilerState::kCPU;
+    platform::EnableProfiler(tracking_device);
+  }
+#endif
+
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
     LOG(WARNING) << "ir optimize only supports CPU currently";

From ca30127e0a048de5e56e249d54d8836422ac2140 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 7 Sep 2018 00:03:13 +0800
Subject: [PATCH 07/13] fix compile error undef registrar pass

---
 paddle/fluid/inference/analysis/analyzer.h |  1 -
 paddle/fluid/inference/api/CMakeLists.txt  | 14 +++++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 6189548a7b..399afbe64a 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -38,7 +38,6 @@ limitations under the License.
*/ #include #include #include - #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/flags.h" #include "paddle/fluid/inference/analysis/pass_manager.h" diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index b69948f40a..f944c9fdec 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -44,7 +44,19 @@ function(inference_api_test TARGET_NAME) endfunction(inference_api_test) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api + analysis + ir_pass_manager + pass + fc_fuse_pass + fc_lstm_fuse_pass + fc_gru_fuse_pass + seq_concat_fc_fuse_pass + graph_viz_pass + infer_clean_graph_pass + graph_pattern_detector + infer_clean_graph_pass + attention_lstm_fuse_pass) cc_test(test_paddle_inference_api SRCS api_tester.cc From 7eebb905235c5780350d92902776a4e0c267c87f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 7 Sep 2018 11:53:19 +0800 Subject: [PATCH 08/13] fix conflicts --- paddle/fluid/inference/analysis/analyzer_lac_tester.cc | 2 +- paddle/fluid/inference/api/helper.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc index 5740faa746..7917152428 100644 --- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc @@ -192,7 +192,7 @@ void TestLACPrediction(const std::string &model_path, sum += timer.toc(); } } - PrintTime(sum, batch_size, repeat); + PrintTime(batch_size, repeat, 1, 0, sum / batch_size); return; } timer.tic(); diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 2c2ac656e8..0ab2542f34 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -124,9 +124,11 @@ std::string DescribeTensor(const PaddleTensor &tensor) { void PrintTime(int batch_size, int repeat, int num_threads, int tid, double latency) { + LOG(INFO) << "====================================="; LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat << ", threads: " << num_threads << ", thread id: " << tid << ", latency: " << latency << "ms"; + LOG(INFO) << "====================================="; } } // namespace inference From c9bd2d50f1d9c0db255ebc132b7c74438f3b3bba Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 7 Sep 2018 12:51:36 +0800 Subject: [PATCH 09/13] refine fc and gru pattern --- .../framework/ir/graph_pattern_detector.cc | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 37566b7621..69a323a8bd 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -519,50 +519,41 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) { PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope, PDNode* x, bool with_bias) { - // Create Operators - PDNode* elementwise_add_op{nullptr}; + // mul op auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul"); - if (with_bias) { - elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add") 
- ->assert_is_op("elementwise_add"); - } - // Create variables - // w auto* mul_weight_var = pattern->NewNode(name_scope, "w") ->AsInput() ->assert_is_persistable_var() - ->assert_is_op_nth_input("mul", "Y", 0); - PDNode* mul_out_var{nullptr}; + ->assert_is_op_input("mul", "Y"); + + PDNode* fc_out{nullptr}; if (with_bias) { + PDNode* elementwise_add_op{nullptr}; + PDNode *mul_out_var{nullptr}, *bias{nullptr}; + elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add") + ->assert_is_op("elementwise_add"); // intermediate variable, will be removed in the IR after fuse. mul_out_var = pattern->NewNode(name_scope, "mul_out") ->AsIntermediate() ->assert_is_only_output_of_op("mul") - ->assert_is_op_input("elementwise_add"); - } - PDNode *bias{nullptr}, *fc_out{nullptr}; - if (with_bias) { + ->assert_is_op_input("elementwise_add", "X"); // bias bias = pattern->NewNode(name_scope, "fc_bias") - ->assert_is_op_input("elementwise_add") - ->AsInput(); + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("elementwise_add", "Y"); // output fc_out = pattern->NewNode(name_scope, "fc_out") ->AsOutput() - ->assert_is_op_output("elementwise_add"); + ->assert_is_op_output("elementwise_add", "Out"); + mul_op->LinksFrom({x, mul_weight_var}).LinksTo({mul_out_var}); + elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); } else { fc_out = pattern->NewNode(name_scope, "fc_out") ->AsOutput() - ->assert_is_op_output("mul"); - } - - if (with_bias) { - mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var}); - elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); - } else { + ->assert_is_op_output("mul", "Out"); mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out}); } - return fc_out; } @@ -609,6 +600,10 @@ PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope, NEW_NODE(gru, BatchResetHiddenPrev, output); NEW_NODE(gru, BatchHidden, output); + BatchGate->AsIntermediate(); + BatchResetHiddenPrev->AsIntermediate(); + BatchHidden->AsIntermediate(); + gru_op->LinksFrom({x, Weight, Bias}); gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden}); return Hidden; From df0c695618696378c8320dd85661fdaa276e7407 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 7 Sep 2018 12:53:15 +0800 Subject: [PATCH 10/13] fix fusion gru pass and enable it --- paddle/fluid/framework/ir/fc_gru_fuse_pass.cc | 98 +++++++++++-------- .../inference/analysis/analyzer_lac_tester.cc | 1 - 2 files changed, 56 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index 4a08beee7d..90d8d5c042 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -28,7 +28,7 @@ static void BuildPattern(PDPattern* pattern, const std::string& name_scope, auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias); fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse. patterns::GRU(pattern, name_scope, fc_out); - VLOG(3) << "\n" << pattern->DotString(); + VLOG(3) << "fc_gru pattern \n" << pattern->DotString(); } static int BuildFusion(Graph* graph, const std::string& name_scope, @@ -51,65 +51,72 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, OpDesc op_desc; op_desc.SetType("fusion_gru"); + +#define NEW_NAME(x) name_scope + "/at." 
#x ".new" #define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()}); SET_IN(X, x); SET_IN(WeightX, weight_x); SET_IN(WeightH, weight_h); - SET_IN(Bias, bias); + if (with_fc_bias) { + op_desc.SetInput("Bias", {NEW_NAME(bias) + bias_n->Name()}); + } else { + SET_IN(Bias, bias); + } #undef SET_IN + op_desc.SetInput("H0", {}); + op_desc.SetOutput("Hidden", {hidden_n->Name()}); + op_desc.SetAttr("is_reverse", gru_n->Op()->GetAttr("is_reverse")); + // TODO(TJ): This should be a option for infer + op_desc.SetAttr("use_seq", true); + +#define SET_IMTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)}) + SET_IMTERMEDIATE_OUT(ReorderedH0); + SET_IMTERMEDIATE_OUT(XX); + SET_IMTERMEDIATE_OUT(BatchedInput); + SET_IMTERMEDIATE_OUT(BatchedOut); +#undef SET_IMTERMEDIATE_OUT + + auto* op = graph->CreateOpNode(&op_desc); + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); + PADDLE_ENFORCE(scope); if (with_fc_bias) { - // Add FC-bias with LSTM-bias and create a new weight - PADDLE_ENFORCE(scope); - const std::string& new_bias_var = name_scope + "_bias.new"; - auto* bias_var = scope->Var(new_bias_var); - PADDLE_ENFORCE(bias_var); - auto* bias_tensor = bias_var->GetMutable(); + // Fusion GRU bias = fcbias + grubias + auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias_n->Name()); + auto* out_bias_tensor = + fusion_bias_var->GetMutable(); + PADDLE_ENFORCE(fusion_bias_var); + GET_NODE(fc_bias); + PADDLE_ENFORCE(fc_bias_n); auto* gru_bias_var = scope->FindVar(bias_n->Name()); + auto* fc_bias_var = scope->FindVar(fc_bias_n->Name()); PADDLE_ENFORCE(gru_bias_var); + PADDLE_ENFORCE(fc_bias_var); const auto& gru_bias_tenosr = gru_bias_var->Get(); - bias_tensor->Resize(gru_bias_tenosr.dims()); - - GET_NODE(fc_bias); - auto* fc_bias_var = scope->FindVar(fc_bias_n->Name()); const auto& fc_bias_tensor = fc_bias_var->Get(); // new bias = fc bias + gru bias - auto* data = bias_tensor->mutable_data(platform::CPUPlace()); - for (int i = 0; i < bias_tensor->numel(); i++) { + out_bias_tensor->Resize(gru_bias_tenosr.dims()); + auto* data = out_bias_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < out_bias_tensor->numel(); i++) { data[i] = fc_bias_tensor.data()[i] + gru_bias_tenosr.data()[i]; } - op_desc.SetInput("Bias", {new_bias_var}); } #undef GET_NODE - op_desc.SetInput("H0", {}); - op_desc.SetOutput("Hidden", {hidden_n->Name()}); - op_desc.SetAttr("is_reverse", gru_n->Op()->GetAttr("is_reverse")); - // TODO(TJ): This should be a option for infer - op_desc.SetAttr("use_seq", true); - - // Create temp variables. 
-    // TODO(TJ): clean code
-    scope->Var(name_scope + "/ReorderedH0.new")
-        ->GetMutable<framework::LoDTensor>();
-    scope->Var(name_scope + "/XX.new")->GetMutable<framework::LoDTensor>();
-    scope->Var(name_scope + "/BatchedInput.new")
-        ->GetMutable<framework::LoDTensor>();
-    scope->Var(name_scope + "/BatchedOut.new")
-        ->GetMutable<framework::LoDTensor>();
-    op_desc.SetOutput("ReorderedH0", {name_scope + "/ReorderedH0.new"});
-    op_desc.SetOutput("XX", {name_scope + "/XX.new"});
-    op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"});
-    op_desc.SetOutput("BatchedOut", {name_scope + "/BatchedOut.new"});
-
-    auto* op = graph->CreateOpNode(&op_desc);
-    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-    // auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+#define NEW_IMTERMEDIATE_OUT(key) \
+  scope->Var(NEW_NAME(key))->GetMutable<framework::LoDTensor>()
+    NEW_IMTERMEDIATE_OUT(ReorderedH0);
+    NEW_IMTERMEDIATE_OUT(XX);
+    NEW_IMTERMEDIATE_OUT(BatchedInput);
+    NEW_IMTERMEDIATE_OUT(BatchedOut);
+#undef NEW_NAME
+#undef NEW_IMTERMEDIATE_OUT

     IR_NODE_LINK_TO(x_n, op);
     IR_NODE_LINK_TO(weight_x_n, op);
     IR_NODE_LINK_TO(weight_h_n, op);
-    IR_NODE_LINK_TO(bias_n, op);
+    IR_NODE_LINK_TO(bias_n, op);  // actually should link to new bias if have
     IR_NODE_LINK_TO(op, hidden_n);
     // h0?
@@ -127,26 +134,33 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
     GET_NODE(x);
-    GET_NODE(w);
+    GET_NODE(w);  // fc weight
     GET_NODE(mul);
     GET_NODE(fc_out);
     GET_NODE(Weight);
     GET_NODE(gru);
     GET_NODE(Bias);
     GET_NODE(Hidden);
+    // nodes need be removed
+    GET_NODE(BatchGate);
+    GET_NODE(BatchResetHiddenPrev);
+    GET_NODE(BatchHidden);

     if (with_fc_bias) {
+      GET_NODE(mul_out);
       GET_NODE(fc_bias);
       GET_NODE(elementwise_add);
       gru_creater(gru, x, w, Weight, Bias, Hidden, fc_bias);
       // Remove unneeded nodes.
       std::unordered_set<const Node*> marked_nodes(
-          {mul_n, gru_n, elementwise_add_n});
+          {mul_n, gru_n, elementwise_add_n, fc_bias_n, fc_out_n, mul_out_n,
+           BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n});
       GraphSafeRemoveNodes(graph, marked_nodes);
     } else {
       gru_creater(gru, x, w, Weight, Bias, Hidden, -1);
       // Remove unneeded nodes.
-      std::unordered_set<const Node*> marked_nodes({mul_n, gru_n});
+      std::unordered_set<const Node*> marked_nodes(
+          {mul_n, gru_n, BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n});
       GraphSafeRemoveNodes(graph, marked_nodes);
     }
 #undef GET_NODE
diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index 7917152428..56f773bf21 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -171,7 +171,6 @@ void TestLACPrediction(const std::string &model_path,
     cfg.device = 0;
     cfg.specify_input_name = true;
     cfg.enable_ir_optim = true;
-    cfg.ir_passes.push_back("fc_gru_fuse_pass");
     predictor =
         CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
   } else {
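Patch 10's handler shows the general shape of a fuse pass: a pattern is registered, the detector calls back once per match with a map from pattern nodes to matched graph nodes, and the handler rewires then deletes the matched interior. Schematically (a sketch against the GraphPatternDetector API, not tied to any one revision):

using paddle::framework::ir::Graph;
using paddle::framework::ir::GraphPatternDetector;
using paddle::framework::ir::GraphSafeRemoveNodes;

void RunFuse(Graph *graph) {
  GraphPatternDetector gpd;
  // 1) describe the subgraph to match: x -> mul -> (elementwise_add) -> gru
  BuildPattern(gpd.mutable_pattern(), "fc_gru_fuse", /*with_fc_bias=*/true);
  // 2) per match: build fusion_gru, relink I/O, drop the dead interior nodes
  gpd(graph, [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) {
    // ... create the fused op and link its external inputs/outputs ...
    // GraphSafeRemoveNodes(g, marked_nodes);
  });
}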
From acfdbf029330e60037e4fff7cee9c00d99f031c5 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 7 Sep 2018 15:56:51 +0800
Subject: [PATCH 11/13] enable ner analysis test and refine lac

---
 .../fluid/inference/analysis/CMakeLists.txt   |  2 +-
 .../inference/analysis/analyzer_lac_tester.cc | 14 ++--
 .../inference/analysis/analyzer_ner_tester.cc | 74 ++++++++++++++++---
 .../inference/analysis/analyzer_tester.cc     |  2 -
 paddle/fluid/inference/api/CMakeLists.txt     | 15 +---
 paddle/fluid/inference/api/helper.h           |  6 +-
 6 files changed, 74 insertions(+), 39 deletions(-)

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index ef55a0c28a..a115bc8f4a 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -81,7 +81,7 @@ if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
 endif()

 inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
-  EXTRA_DEPS paddle_inference_api paddle_fluid_api
+  EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
   ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
        --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)

diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index 56f773bf21..4ff7251473 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -15,11 +15,9 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include 
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -160,7 +158,7 @@ void TestLACPrediction(const std::string &model_path,
   config.use_gpu = false;
   config.device = 0;
   config.specify_input_name = true;
-  std::vector<PaddleTensor> input_slots, outputs_slots, ref_outputs_slots;
+  std::vector<PaddleTensor> input_slots, outputs_slots;
   DataRecord data(data_file, batch_size);
   GetOneBatch(&input_slots, &data, batch_size);
   std::unique_ptr<PaddlePredictor> predictor;
@@ -217,6 +215,7 @@ void TestLACPrediction(const std::string &model_path,
     // run once for comparison as reference
     auto ref_predictor =
         CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    std::vector<PaddleTensor> ref_outputs_slots;
     ref_predictor->Run(input_slots, &ref_outputs_slots);
     EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
     auto &ref_out = ref_outputs_slots[0];
@@ -246,9 +245,10 @@ void TestLACPrediction(const std::string &model_path,
   }
   LOG(INFO) << "has num ops: " << num_ops;
   ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-  LOG(INFO) << "fc fuse num:" << fuse_statis.at("fc_fuse");
-  // LOG(INFO) << "fc gru fuse num:" << fuse_statis.at("fc_gru_fuse");
+  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
+  EXPECT_EQ(num_ops, 11);
 }
diff --git a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
index eaae09b051..f5c5d73aeb 100644
--- a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
@@ -13,12 +13,11 @@
 // limitations under the License.

 #include "paddle/fluid/inference/analysis/analyzer.h"
-#include 
 #include 
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -112,7 +111,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                        48, 39, 38, 16, 25};

-void TestChineseNERPrediction() {
+void TestChineseNERPrediction(bool use_analysis) {
   NativeConfig config;
   config.prog_file = FLAGS_infer_model + "/__model__";
   config.param_file = FLAGS_infer_model + "/param";
   config.use_gpu = false;
   config.device = 0;
   config.specify_input_name = true;

-  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  std::vector<PaddleTensor> input_slots;
-  std::vector<PaddleTensor> outputs;
+  std::vector<PaddleTensor> input_slots, outputs;
+  std::unique_ptr<PaddlePredictor> predictor;
   Timer timer;
+  if (use_analysis) {
+    AnalysisConfig cfg;
+    cfg.prog_file = FLAGS_infer_model + "/__model__";
+    cfg.param_file = FLAGS_infer_model + "/param";
+    cfg.use_gpu = false;
+    cfg.device = 0;
+    cfg.specify_input_name = true;
+    cfg.enable_ir_optim = true;
+    predictor =
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
+  } else {
+    predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  }

   if (FLAGS_test_all_data) {
     LOG(INFO) << "test all data";
@@ -165,10 +176,51 @@ void TestChineseNERPrediction() {
   for (size_t i = 0; i < std::min(11UL, size); i++) {
     PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
   }
+
+  if (use_analysis) {
+    // run once for comparison as reference
+    auto ref_predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    std::vector<PaddleTensor> ref_outputs_slots;
+    ref_predictor->Run(input_slots, &ref_outputs_slots);
+    EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
+    auto &ref_out = ref_outputs_slots[0];
+    size_t ref_size =
+        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
+                        [](int a, int b) { return a * b; });
+    EXPECT_EQ(size, ref_size);
+    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+    for (size_t i = 0; i < size; ++i) {
+      EXPECT_EQ(pdata_ref[i], result[i]);
+    }
+
+    AnalysisPredictor *analysis_predictor =
+        dynamic_cast<AnalysisPredictor *>(predictor.get());
+    auto &fuse_statis = analysis_predictor->analysis_argument()
+                            .Get<std::unordered_map<std::string, int>>(
+                                framework::ir::kFuseStatisAttr);
+    for (auto &item : fuse_statis) {
+      LOG(INFO) << "fused " << item.first << " " << item.second;
+    }
+    int num_ops = 0;
+    for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2); + EXPECT_EQ(num_ops, 14); + } } -// Directly infer with the original model. -TEST(Analyzer, Chinese_ner) { TestChineseNERPrediction(); } +TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); } + +TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); } } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 4cf26d3c70..a496ae41aa 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -283,7 +283,6 @@ void TestDituRNNPrediction(bool use_analysis, bool activate_ir, base_predictor->Run(input_slots, &base_outputs); - LOG(INFO) << "===========profile result==========="; if (num_threads == 1) { // Prepare inputs. Timer timer; @@ -324,7 +323,6 @@ void TestDituRNNPrediction(bool use_analysis, bool activate_ir, threads[i].join(); } } - LOG(INFO) << "====================================="; if (use_analysis && activate_ir) { AnalysisPredictor *analysis_predictor = diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index f944c9fdec..5df486f345 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -44,20 +44,7 @@ function(inference_api_test TARGET_NAME) endfunction(inference_api_test) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api - analysis - ir_pass_manager - pass - fc_fuse_pass - fc_lstm_fuse_pass - fc_gru_fuse_pass - seq_concat_fc_fuse_pass - graph_viz_pass - infer_clean_graph_pass - graph_pattern_detector - infer_clean_graph_pass - attention_lstm_fuse_pass) - +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 0ab2542f34..f6893be428 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -124,11 +124,9 @@ std::string DescribeTensor(const PaddleTensor &tensor) { void PrintTime(int batch_size, int repeat, int num_threads, int tid, double latency) { - LOG(INFO) << "====================================="; - LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat + LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat << ", threads: " << num_threads << ", thread id: " << tid - << ", latency: " << latency << "ms"; - LOG(INFO) << "====================================="; + << ", latency: " << latency << "ms ======"; } } // namespace inference From 3ea19b759649feabf45860e4e4c808c26845c3c7 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 7 Sep 2018 18:48:45 +0800 Subject: [PATCH 12/13] fix bug and fc pass ut --- paddle/fluid/framework/ir/graph_pattern_detector.cc | 9 ++++----- paddle/fluid/inference/analysis/analyzer_lac_tester.cc | 1 + paddle/fluid/inference/analysis/analyzer_ner_tester.cc | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git 
a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 69a323a8bd..5ca7509515 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -536,22 +536,21 @@ PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope, mul_out_var = pattern->NewNode(name_scope, "mul_out") ->AsIntermediate() ->assert_is_only_output_of_op("mul") - ->assert_is_op_input("elementwise_add", "X"); + ->assert_is_op_input("elementwise_add"); // bias bias = pattern->NewNode(name_scope, "fc_bias") ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("elementwise_add", "Y"); + ->assert_is_op_input("elementwise_add"); // output fc_out = pattern->NewNode(name_scope, "fc_out") ->AsOutput() - ->assert_is_op_output("elementwise_add", "Out"); + ->assert_is_op_output("elementwise_add"); mul_op->LinksFrom({x, mul_weight_var}).LinksTo({mul_out_var}); elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); } else { fc_out = pattern->NewNode(name_scope, "fc_out") ->AsOutput() - ->assert_is_op_output("mul", "Out"); + ->assert_is_op_output("mul"); mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out}); } return fc_out; diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc index 4ff7251473..b906b32cf5 100644 --- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" diff --git a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc index f5c5d73aeb..661b047ed7 100644 --- a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" From 5a2fc5b52f20b0c905a38ebd0fe206f88dadd649 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 7 Sep 2018 22:54:42 +0800 Subject: [PATCH 13/13] fix print error --- paddle/fluid/inference/analysis/analyzer_lac_tester.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc index b906b32cf5..522d870db8 100644 --- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc @@ -182,6 +182,7 @@ void TestLACPrediction(const std::string &model_path, Timer timer; if (test_all_data) { double sum = 0; + LOG(INFO) << "Total number of samples: " << data.datasets.size(); for (int i = 0; i < repeat; i++) { for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { GetOneBatch(&input_slots, &data, batch_size); @@ -190,7 +191,9 @@ void TestLACPrediction(const std::string &model_path, sum += timer.toc(); } } - 
PrintTime(batch_size, repeat, 1, 0, sum / batch_size); + PrintTime(batch_size, repeat, 1, 0, sum / repeat); + LOG(INFO) << "Average latency of each sample: " + << sum / repeat / data.datasets.size() << " ms"; return; } timer.tic();
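For reference, the arithmetic patch 13 corrects, in isolation: sum accumulates wall time across repeat sweeps over the whole dataset, so dividing by batch_size reported a meaningless number. Per-sweep latency is sum / repeat, and per-sample latency divides once more by the dataset size. A self-contained check:

#include <cstdio>

double PerSweepMs(double sum_ms, int repeat) { return sum_ms / repeat; }

double PerSampleMs(double sum_ms, int repeat, int num_samples) {
  return PerSweepMs(sum_ms, repeat) / num_samples;
}

int main() {
  // 10 sweeps over 300 samples taking 1200 ms in total: 0.4 ms per sample.
  std::printf("%.3f ms/sample\n", PerSampleMs(1200.0, 10, 300));
  return 0;
}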