From eaea82fbabd5e6a84bddd5467bed70eed42ccdba Mon Sep 17 00:00:00 2001
From: chenweihang
Date: Thu, 26 Jul 2018 03:01:42 +0000
Subject: [PATCH 01/22] feat: Add memory usage estimate function

---
 python/paddle/fluid/contrib/__init__.py       |  4 +-
 .../paddle/fluid/contrib/memory_usage_calc.py | 99 +++++++++++++++++++
 2 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 python/paddle/fluid/contrib/memory_usage_calc.py

diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
index 12cd5d918e..9de9e95045 100644
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -14,5 +14,7 @@
 
 import decoder
 from decoder import *
+import memory_usage_calc
+from memory_usage_calc import *
 
-__all__ = decoder.__all__
+__all__ = decoder.__all__ + memory_usage_calc.__all__

diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py
new file mode 100644
index 0000000000..7b9c7afee0
--- /dev/null
+++ b/python/paddle/fluid/contrib/memory_usage_calc.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module provides a memory usage calculation function for users.
+The purpose of this API is to allow users to estimate the memory usage of
+a program under a specific batch size, so that they can choose an
+appropriate batch size to fully utilize a GPU.
+
+This API is still under active development and may change drastically.
+"""
+
+from .. import core
+from ..framework import Program, Variable
+
+__all__ = ['MemoryInfo']
+
+DEBUG = False
+
+dtype_to_size = {
+    core.VarDesc.VarType.FP16: 2,
+    core.VarDesc.VarType.FP32: 4,
+    core.VarDesc.VarType.FP64: 8,
+    core.VarDesc.VarType.INT16: 2,
+    core.VarDesc.VarType.INT32: 4,
+    core.VarDesc.VarType.INT64: 8,
+    core.VarDesc.VarType.BOOL: 1,
+    core.VarDesc.VarType.UINT8: 1,
+}
+
+
+class MemoryInfo(object):
+    def __init__(self, program):
+        if not isinstance(program, Program):
+            raise TypeError(
+                "Calculating Memory Usage requires Program as its Parameter. "
+                "But you passed in %s" % (type(program)))
+        self._program = program
+
+    def _has_var(self, block, var_name):
+        return block.has_var(str(var_name))
+
+    def _find_var(self, block, var_name):
+        return block.var(str(var_name))
+
+    def get_memory_usage(self, batch_size, with_details=False):
+
+        # get the first block of the program
+        first_block = self._program.global_block()
+
+        # get the var_name list of the first block
+        # TODO(chenweihang): there is no API to get a block's var list directly
+        total_memory = 0.0
+        for var in self._program.list_vars():
+            if DEBUG:
+                print "All Block's Var: %s" % (var.name)
+            # TODO(chenweihang): why not use program.list_vars() to
+            # calculate all variables' memory directly?
+            if self._has_var(first_block, var.name):
+                if DEBUG:
+                    print "First Block's Var: %s" % (var.name)
+                    print "Var's shape: ", var.shape
+                    print "Var's dtype: ", var.dtype
+                data_count = 1
+                for x in var.shape:
+                    if x == -1:
+                        data_count *= batch_size
+                    else:
+                        data_count *= x
+                var_memory = data_count * dtype_to_size[var.dtype]
+                if DEBUG:
+                    print "Var's memory: %d" % (var_memory)
+                total_memory += var_memory
+
+        # Convert unit and make result string
+        result_str = "- With current batch size, memory usage is about "
+        unit_str = " B."
+        if total_memory > 1024:
+            total_memory /= 1024
+            unit_str = " KB."
+            if total_memory > 1024:
+                total_memory /= 1024
+                unit_str = " MB."
+
+        # Append extra memory consumption (5% - 10%)
+        result_str += str(round(total_memory * 1.05, 3)) + " - " \
+            + str(round(total_memory * 1.10, 3)) + unit_str
+
+        return result_str
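To make the per-variable arithmetic above concrete: each variable contributes the product of its shape dimensions (with -1 replaced by the batch size) multiplied by the dtype width from dtype_to_size. A minimal standalone check, using an illustrative shape rather than one taken from a real program:

    batch_size = 10
    shape, dtype_size = (-1, 13), 4   # e.g. a float32 input with 13 features
    data_count = 1
    for x in shape:
        data_count *= batch_size if x == -1 else x
    print(data_count * dtype_size)    # 10 * 13 * 4 = 520 bytes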
From ea548a794b6f4819b51aa00a16feb817f8563cdb Mon Sep 17 00:00:00 2001
From: chenweihang
Date: Thu, 26 Jul 2018 05:58:07 +0000
Subject: [PATCH 02/22] refactor: simplify class to function

---
 .../paddle/fluid/contrib/memory_usage_calc.py | 105 +++++++++---------
 1 file changed, 54 insertions(+), 51 deletions(-)

diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py
index 7b9c7afee0..5da846edb6 100644
--- a/python/paddle/fluid/contrib/memory_usage_calc.py
+++ b/python/paddle/fluid/contrib/memory_usage_calc.py
@@ -23,9 +23,7 @@
 from .. import core
 from ..framework import Program, Variable
 
-__all__ = ['MemoryInfo']
-
-DEBUG = False
+__all__ = ['memory_usage']
 
 dtype_to_size = {
     core.VarDesc.VarType.FP16: 2,
@@ -38,62 +36,67 @@ dtype_to_size = {
     core.VarDesc.VarType.UINT8: 1,
 }
 
+DEBUG = False
 
-class MemoryInfo(object):
-    def __init__(self, program):
-        if not isinstance(program, Program):
-            raise TypeError(
-                "Calculating Memory Usage requires Program as its Parameter. "
-                "But you passed in %s" % (type(program)))
-        self._program = program
-
-    def _has_var(self, block, var_name):
-        return block.has_var(str(var_name))
-
-    def _find_var(self, block, var_name):
-        return block.var(str(var_name))
-
-    def get_memory_usage(self, batch_size, with_details=False):
-
-        # get the first block of the program
-        first_block = self._program.global_block()
-
-        # get the var_name list of the first block
-        # TODO(chenweihang): there is no API to get a block's var list directly
-        total_memory = 0.0
-        for var in self._program.list_vars():
-            if DEBUG:
-                print "All Block's Var: %s" % (var.name)
-            # TODO(chenweihang): why not use program.list_vars() to
-            # calculate all variables' memory directly?
-            if self._has_var(first_block, var.name):
-                if DEBUG:
-                    print "First Block's Var: %s" % (var.name)
-                    print "Var's shape: ", var.shape
-                    print "Var's dtype: ", var.dtype
-                data_count = 1
-                for x in var.shape:
-                    if x == -1:
-                        data_count *= batch_size
-                    else:
-                        data_count *= x
-                var_memory = data_count * dtype_to_size[var.dtype]
-                if DEBUG:
-                    print "Var's memory: %d" % (var_memory)
-                total_memory += var_memory
+def memory_usage(program, batch_size):
+    """
+    Get the estimated memory usage of a program with the given batch size.
+
+    Args:
+        program(Program): The current Program.
+        batch_size(int): The current input data batch_size.
+
+    Returns:
+        min_total_memory(float): the lower bound of the estimated memory usage.
+        max_total_memory(float): the upper bound of the estimated memory usage.
+        unit_str(string): the unit of the estimated usage result.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
+                fluid.default_main_program(), batch_size=10)
+        >>> print "memory usage is about %.3f - %.3f %s" % \
+                (lower_usage, upper_usage, unit)
+
+    """
+
+    # Parameter check
+    if not isinstance(program, Program):
+        raise TypeError(
+            "Calculating Memory Usage requires Program as its Parameter. "
+            "But you passed in %s" % (type(program)))
+    if batch_size <= 0:
+        raise ValueError("The batch size needs to be positive.")
+
+    # Get the var list of the first (global) block and sum the memory
+    total_memory = 0.0
+    for var in program.global_block().vars.itervalues():
+        data_count = 1
+        for x in var.shape:
+            if x == -1:
+                data_count *= batch_size
+            else:
+                data_count *= x
+        var_memory = data_count * dtype_to_size[var.dtype]
+        if DEBUG:
+            print "%s memory usage: %d" % (var.name, var_memory)
+        total_memory += var_memory
+    if DEBUG:
+        print "total memory usage: %.2f" % (total_memory)
 
-        # Convert unit and make result string
-        result_str = "- With current batch size, memory usage is about "
-        unit_str = " B."
-        if total_memory > 1024:
-            total_memory /= 1024
-            unit_str = " KB."
-            if total_memory > 1024:
-                total_memory /= 1024
-                unit_str = " MB."
+    # Convert to an appropriate unit
+    unit_str = "B"
+    if total_memory > 1024:
+        total_memory /= 1024
+        unit_str = "KB"
+        if total_memory > 1024:
+            total_memory /= 1024
+            unit_str = "MB"
 
-        # Append extra memory consumption (5% - 10%)
-        result_str += str(round(total_memory * 1.05, 3)) + " - " \
-            + str(round(total_memory * 1.10, 3)) + unit_str
-
-        return result_str
+    # Append extra memory consumption (5% - 10%)
+    min_total_memory = total_memory * 1.05
+    max_total_memory = total_memory * 1.1
+
+    return min_total_memory, max_total_memory, unit_str
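Because memory_usage now returns numbers rather than a formatted string, a caller can programmatically pick a batch size that fits a memory budget. A hedged sketch (the helper name, budget, and candidate list are illustrative and not part of the patch):

    import paddle.fluid as fluid

    def largest_fitting_batch(program, budget_mb, candidates=(32, 64, 128, 256)):
        to_mb = {"B": 1.0 / 1024 ** 2, "KB": 1.0 / 1024, "MB": 1.0}
        best = None
        for bs in candidates:
            _, upper, unit = fluid.contrib.memory_usage(program, batch_size=bs)
            if upper * to_mb[unit] <= budget_mb:
                best = bs
        return best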
+ "But you passed in %s" % (type(prgram))) + if batch_size <= 0: + raise ValueError("The batch size need to be positive.") - # get the var_name list of first block - # TODO(chenweihang): not find the API get block's var list directly - total_memory = 0.0 - for var in self._program.list_vars(): - if DEBUG: - print "All Block's Var: %s" % (var.name) - # TODO(chenweihang): why not used program.list_vars() - # calculate all variable's memory directly? - if self._has_var(first_block, var.name): - if DEBUG: - print "First Block's Var: %s" % (var.name) - print "Var's shape: ", var.shape - print "Var's dtype: ", var.dtype - data_count = 1 - for x in var.shape: - if x == -1: - data_count *= batch_size - else: - data_count *= x - var_memory = data_count * dtype_to_size[var.dtype] - if DEBUG: - print "Var's memory: %d" % (var_memory) - total_memory += var_memory + # Get the var_name list of first block and calculate + total_memory = 0.0 + for var in program.global_block().vars.itervalues(): + data_count = 1 + for x in var.shape: + if x == -1: + data_count *= batch_size + else: + data_count *= x + var_memory = data_count * dtype_to_size[var.dtype] + if DEBUG: + print "%s memory usage: %d" % (var.name, var_memory) + total_memory += var_memory + if DEBUG: + print "total memory usage: %.2f" % (total_memory) - # Convert unit and make result string - result_str = "- With current batch size, memory usage is about " - unit_str = " B." + # Convert appropriate unit + unit_str = "B" + if total_memory > 1024: + total_memory /= 1024 + unit_str = "KB" if total_memory > 1024: total_memory /= 1024 - unit_str = " KB." - if total_memory > 1024: - total_memory /= 1024 - unit_str = " MB." + unit_str = "MB" - # Append extra memory consumption (5% - 10%) - result_str += str(round(total_memory * 1.05, 3)) + " - " \ - + str(round(total_memory * 1.10, 3)) + unit_str + # Append extra memory consumption (5% - 10%) + min_total_memory = total_memory * 1.05 + max_total_memory = total_memory * 1.1 - return result_str + return min_total_memory, max_total_memory, unit_str From 59c900e1e95a6a05dbba0e1d3aad9b2dc6c18419 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Tue, 31 Jul 2018 11:35:12 +0800 Subject: [PATCH 03/22] Update API.spec --- paddle/fluid/API.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 5f3bfa2965..56c3a290db 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -336,6 +336,7 @@ paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=Non paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) From 
From 964d631cfdc39eb02bdf6fcfee9ae9ed24dcdfa0 Mon Sep 17 00:00:00 2001
From: chenweihang
Date: Tue, 31 Jul 2018 11:54:59 +0000
Subject: [PATCH 04/22] feat: add unittest of memory usage

---
 .../tests/unittests/test_memory_usage.py | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/test_memory_usage.py

diff --git a/python/paddle/fluid/tests/unittests/test_memory_usage.py b/python/paddle/fluid/tests/unittests/test_memory_usage.py
new file mode 100644
index 0000000000..c1e286d327
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_memory_usage.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+import contextlib
+import unittest
+
+
+def train_simulator(use_cuda, test_batch_size=10):
+    if test_batch_size <= 0:
+        raise ValueError("batch_size should be a positive integral value, "
+                         "but got batch_size={}".format(test_batch_size))
+
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.uci_housing.train(), buf_size=500),
+        batch_size=test_batch_size)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
+        fluid.default_main_program(), batch_size=test_batch_size)
+
+    print("memory usage is about %.3f - %.3f %s" %
+          (lower_usage, upper_usage, unit))
+
+
+class TestMemoryUsage(unittest.TestCase):
+    def test_cpu(self):
+        with self.program_scope_guard():
+            train_simulator(use_cuda=False)
+
+    def test_cpu_with_unit_KB(self):
+        with self.program_scope_guard():
+            train_simulator(use_cuda=False, test_batch_size=1000)
+
+    def test_cpu_with_unit_MB(self):
+        with self.program_scope_guard():
+            train_simulator(use_cuda=False, test_batch_size=100000)
+
+    def test_cuda(self):
+        with self.program_scope_guard():
+            train_simulator(use_cuda=True)
+
+    @contextlib.contextmanager
+    def program_scope_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()
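The three batch sizes exercised above (10, 1000, 100000) line up with the B/KB/MB branches of memory_usage. Very rough arithmetic, counting only the 13-feature float32 input and ignoring the other, smaller variables:

    bytes_per_sample = 13 * 4             # 'x' alone: 13 float32 features
    for bs in (10, 1000, 100000):
        print(bs, bytes_per_sample * bs)  # ~520 B, ~52 KB, ~5.2 MB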
From 8627ef3330e7d0740ab1267ca5fdc0ee9268135d Mon Sep 17 00:00:00 2001
From: chenweihang
Date: Wed, 1 Aug 2018 03:19:57 +0000
Subject: [PATCH 05/22] refactor: simplify unittest function

---
 .../tests/unittests/test_memory_usage.py | 27 ++++++-------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_memory_usage.py b/python/paddle/fluid/tests/unittests/test_memory_usage.py
index c1e286d327..f9daf83652 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_usage.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_usage.py
@@ -19,7 +19,7 @@
 import contextlib
 import unittest
 
 
-def train_simulator(use_cuda, test_batch_size=10):
+def train_simulator(test_batch_size=10):
     if test_batch_size <= 0:
         raise ValueError("batch_size should be a positive integral value, "
                          "but got batch_size={}".format(test_batch_size))
@@ -34,14 +34,7 @@
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
     sgd_optimizer.minimize(avg_cost)
 
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.uci_housing.train(), buf_size=500),
-        batch_size=test_batch_size)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
+    # Calculate memory usage with the current network configuration
     lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
         fluid.default_main_program(), batch_size=test_batch_size)
 
@@ -50,21 +43,17 @@
 
 class TestMemoryUsage(unittest.TestCase):
-    def test_cpu(self):
-        with self.program_scope_guard():
-            train_simulator(use_cuda=False)
-
-    def test_cpu_with_unit_KB(self):
+    def test_with_unit_B(self):
         with self.program_scope_guard():
-            train_simulator(use_cuda=False, test_batch_size=1000)
+            train_simulator()
 
-    def test_cpu_with_unit_MB(self):
+    def test_with_unit_KB(self):
         with self.program_scope_guard():
-            train_simulator(use_cuda=False, test_batch_size=100000)
+            train_simulator(test_batch_size=1000)
 
-    def test_cuda(self):
+    def test_with_unit_MB(self):
         with self.program_scope_guard():
-            train_simulator(use_cuda=True)
+            train_simulator(test_batch_size=100000)

From 32a9e050bc25547e40d225b055191ea22287de2d Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Tue, 7 Aug 2018 17:07:13 +0000
Subject: [PATCH 06/22] mapping the variable name inside the subgraph

---
 paddle/fluid/inference/analysis/analyzer.cc | 15 +++-
 .../analysis/data_flow_graph_to_fluid_pass.cc | 84 +++++++++++++++++--
 .../inference/analysis/subgraph_splitter.cc | 2 +-
 .../inference/tensorrt/convert/CMakeLists.txt | 1 +
 .../inference/tensorrt/convert/op_converter.h | 3 +-
 paddle/fluid/operators/tensorrt_engine_op.cc | 20 ++---
 paddle/fluid/operators/tensorrt_engine_op.h | 16 +++-
 7 files changed, 112 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index 98bdfcc00b..c4ab26a228 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -24,7 +24,7 @@
 
 namespace paddle {
 
-DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
+DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true,
             "Enable subgraph to TensorRT engine for acceleration");
 
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
@@ -42,10 +42,19 @@ class DfgPassManagerImpl final : public DfgPassManager {
     // TODO(Superjomn) set the key with pass reprs.
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { - auto trt_teller = [](const Node* node) { + auto trt_teller = [&](const Node* node) { + std::unordered_set teller_set( + {"elementwise_add", "mul", "conv2d", "pool2d", "relu"}); if (!node->IsFunction()) return false; - return static_cast(node)->func_type() == "mul"; + + const auto* func = static_cast(node); + if (teller_set.count(func->func_type())) + return true; + else { + return false; + } }; + AddPass("tensorrt-subgraph-marker", new TensorRTSubgraphNodeMarkPass(trt_teller)); AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 2328d87042..8eac84343d 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -23,7 +23,7 @@ namespace paddle { namespace inference { -DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size"); +DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size"); DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size"); namespace analysis { @@ -87,27 +87,90 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { } void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, - const framework::proto::BlockDesc &block) { + framework::proto::BlockDesc &block) { static int counter{0}; PADDLE_ENFORCE(node->IsFunctionBlock()); framework::OpDesc desc; auto *func = static_cast(node); // collect inputs - std::vector io; + std::unordered_set input_names; for (auto *x : func->inlinks) { - io.push_back(x->name()); + input_names.insert(x->name()); } - desc.SetInput("Xs", io); + desc.SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); - // collect outputs - io.clear(); + std::unordered_set output_names; for (auto *x : func->outlinks) { - io.push_back(x->name()); + output_names.insert(x->name()); } - desc.SetOutput("Ys", io); + + std::vector output_temp(output_names.begin(), + output_names.end()); + desc.SetOutput("Ys", output_temp); desc.SetType("tensorrt_engine"); + std::unordered_map output_name_map; + auto subgraph_nodes = func->subgraph; + + for (int index = 0; index < block.ops_size(); index++) { + framework::proto::OpDesc *op = block.mutable_ops(index); + // auto &op = block.mutable_ops(index); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->name(), op->type()); + + std::unordered_map var2id; + for (auto *in_var : correspond_node->inlinks) { + var2id[in_var->name()] = in_var->id(); + } + // TODO(zhaolong): add comments + for (int i = 0; i < op->inputs_size(); i++) { + framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i); + // auto &in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { + std::string arg_value = in_var->arguments(k); + if (input_names.count(arg_value)) { + replaced_names.push_back(arg_value); + } else { + replaced_names.push_back(arg_value + + std::to_string(var2id[arg_value])); + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outlinks) { + var2id[out_var->name()] = out_var->id(); + } + + for (int i = 0; i < op->outputs_size(); i++) { + 
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + std::string arg_value = out_var->arguments(k); + if (output_names.count(arg_value)) { + output_name_map[arg_value] = + arg_value + std::to_string(var2id[arg_value]); + } + replaced_names.push_back(arg_value + std::to_string(var2id[arg_value])); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } + std::vector output_mapping; + for (auto name : output_names) { + PADDLE_ENFORCE(output_name_map.count(name) != 0); + output_mapping.push_back(output_name_map[name]); + } + PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc"); // Set attrs SetAttr(desc.Proto(), "subgraph", block.SerializeAsString()); @@ -115,6 +178,7 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize); SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); + SetAttr(desc.Proto(), "output_name_mapping", output_mapping); node->SetPbMsg(desc.Proto()->SerializeAsString()); } @@ -146,11 +210,13 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { LOG(INFO) << "transformed variable size: " << block_desc.Proto()->vars().size(); // copy ops. + for (auto *node : block_node->subgraph) { auto *op = block_desc.AppendOp(); PADDLE_ENFORCE(!node->pb_msg().empty()); op->Proto()->ParseFromString(node->pb_msg()); } + *block_desc.Proto()->mutable_vars() = argument_->origin_program_desc->blocks(0).vars(); PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 389f9e1a91..80809d4c43 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { std::vector> SubGraphSplitter::ExtractSubGraphs() { std::vector marked_nodes; - for (auto &node : GraphTraits(graph_).nodes()) { + for (auto &node : GraphTraits(graph_).nodes_in_TS()) { if (node.attr(kMarkerAttrName).Bool()) { marked_nodes.push_back(&node); } diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index d86c046f2e..8f42a37cd3 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,6 +1,7 @@ # Add TRT tests nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc +activation_op.cc DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 1b6a0ad82f..41faaf7212 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -55,7 +55,6 @@ class OpConverter { it = Registry::Lookup("fc"); } } - if (op_desc.Type().find("elementwise") != std::string::npos) { static std::unordered_set add_tensor_op_set{ "add", "mul", "sub", "div", "max", "min", "pow"}; @@ -72,6 +71,8 @@ class OpConverter { "Unsupported elementwise type" + op_type); it = 
Registry::Lookup("elementwise_" + op_type + "_weight"); + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); } else { PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0, "Unsupported elementwise type" + op_type); diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 1172822e12..ee3078876c 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -55,18 +55,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { "TensorRT' tensor input requires at least 2 dimensions"); PADDLE_ENFORCE_LE(shape.size(), 4UL, "TensorRT' tensor input requires at most 4 dimensions"); - - switch (shape.size()) { - case 2: - return nvinfer1::Dims2(1, shape[1]); - case 3: - return nvinfer1::Dims3(1, shape[1], shape[2]); - case 4: - return nvinfer1::Dims4(1, shape[1], shape[2], shape[3]); - default: - return nvinfer1::Dims(); - } - return nvinfer1::Dims(); + PADDLE_ENFORCE_EQ(shape.size(), 4UL); + return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); } } // namespace @@ -86,6 +76,9 @@ void TensorRTEngineKernel::Prepare( parameters.insert(param); } + std::vector output_maps = + context.Attr>("output_name_mapping"); + // TODO(Superjomn) replace this with a different stream auto *engine = Singleton::Global().Create( max_batch, max_workspace, nullptr /*engine hold its own stream*/, @@ -97,6 +90,7 @@ void TensorRTEngineKernel::Prepare( // Add inputs VLOG(4) << "declare inputs"; for (auto &input : context.Inputs("Xs")) { + if (parameters.count(input)) continue; VLOG(4) << "declare input " << input; auto *var = block.FindVar(input); // TensorRT engine need to create parameters. The parameter's description @@ -122,7 +116,7 @@ void TensorRTEngineKernel::Prepare( block_desc, parameters, context.scope(), engine); // Add outputs - for (auto &output : context.Outputs("Ys")) { + for (auto &output : output_maps) { engine->DeclareOutput(output); } diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 32d10fd8a5..2cbe1213a2 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -66,8 +66,17 @@ class TensorRTEngineKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, context.Attr("max_batch")); + std::vector output_maps = + context.Attr>("output_name_mapping"); + + auto params = context.Attr>("parameters"); + std::unordered_set parameters; + for (const auto& param : params) { + parameters.insert(param); + } // Convert input tensor from fluid to engine. for (const auto& x : context.Inputs("Xs")) { + if (parameters.count(x)) continue; // convert input and copy to TRT engine's buffer auto& t = inference::analysis::GetFromScope( context.scope(), x); @@ -82,10 +91,12 @@ class TensorRTEngineKernel : public framework::OpKernel { // Execute the engine. PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0); engine->Execute(FLAGS_tensorrt_engine_batch_size); + // Convert output tensor from engine to fluid + int output_index = 0; for (const auto& y : context.Outputs("Ys")) { // convert output and copy to fluid. - nvinfer1::ITensor* trt_t = engine->GetITensor(y); + nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); auto dims = trt_t->getDimensions(); // Use the output ITensor's dims to reshape the Fluid Tensor. 
std::vector ddim(dims.d, dims.d + dims.nbDims); @@ -102,7 +113,7 @@ class TensorRTEngineKernel : public framework::OpKernel { // TODO(Superjomn) change this float to dtype size. auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * FLAGS_tensorrt_engine_batch_size; - engine->GetOutputInCPU(y, + engine->GetOutputInCPU(output_maps[output_index], fluid_t->mutable_data(platform::CPUPlace()), size * sizeof(float)); //} else { @@ -110,6 +121,7 @@ class TensorRTEngineKernel : public framework::OpKernel { // y, fluid_t->mutable_data(platform::CUDAPlace()), // size * sizeof(float)); //} + output_index += 1; } cudaStreamSynchronize(*engine->stream()); From b4d48531e479b233b409a223a2c5bb4ffabe40ef Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 8 Aug 2018 10:04:50 +0800 Subject: [PATCH 07/22] optimize vlog before and after op run, move into op.run --- paddle/fluid/framework/executor.cc | 5 ----- paddle/fluid/framework/operator.cc | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c2800c972a..dad170ed78 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -330,12 +330,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } for (auto& op : ctx->ops_) { - VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); - // NOTE! Please do not delete this line, it's usefull because the debug - // string before and after op.run are different, after run the output - // will have right shape which is usefull for debug. - VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 0c8acf71bf..130f3fe3e5 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -127,7 +127,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - VLOG(10) << "- " << DebugStringEx(&scope); + VLOG(4) << place << " " << DebugStringEx(&scope); if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("Cannot run operator on place %s", place); @@ -139,7 +139,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(Type(), pool.Get(place)); RunImpl(scope, place); - VLOG(10) << "+ " << DebugStringEx(&scope); + VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { From e51d045a6d7add2d8bf765e834a10dee76f5117c Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 8 Aug 2018 03:25:43 +0000 Subject: [PATCH 08/22] modify trt engine op test --- paddle/fluid/operators/tensorrt_engine_op_test.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 7cb1e47a15..b6e285dc5d 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -103,6 +103,8 @@ TEST(TensorRTEngineOp, manual) { SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); + SetAttr>(engine_op_desc.Proto(), "output_name_mapping", + 
std::vector({"z0"})); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); @@ -196,6 +198,9 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { std::vector({"y0", "y1", "y2", "y3"})); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); + SetAttr>(engine_op_desc.Proto(), "output_name_mapping", + std::vector({"z3"})); + auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); // Execute them. From 3a0caf801fbf499216b92e32dc63bae4260dd8b6 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 8 Aug 2018 03:32:12 +0000 Subject: [PATCH 09/22] modify trt engine op test --- paddle/fluid/operators/tensorrt_engine_op_test.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index b6e285dc5d..37657fa0b0 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -103,7 +103,8 @@ TEST(TensorRTEngineOp, manual) { SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); - SetAttr>(engine_op_desc.Proto(), "output_name_mapping", + SetAttr>(engine_op_desc.Proto(), + "output_name_mapping", std::vector({"z0"})); LOG(INFO) << "create engine op"; @@ -198,7 +199,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { std::vector({"y0", "y1", "y2", "y3"})); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); - SetAttr>(engine_op_desc.Proto(), "output_name_mapping", + SetAttr>(engine_op_desc.Proto(), + "output_name_mapping", std::vector({"z3"})); auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); From 7c8b69c700a19d777f27ae0495cd2129d3156a9c Mon Sep 17 00:00:00 2001 From: chengduo Date: Wed, 8 Aug 2018 14:01:12 +0800 Subject: [PATCH 10/22] Feature/op fusion (#12240) * Add Preface * Add demo code * Save file * Refine code * seems can work * use elementwise strategy * Use ElementwiseComputeEx * Add comments * extract functions from operator * Refine code * Follow comment * code refine * follow comments * follow comments --- doc/survey/op_fusion_design.md | 20 + .../operators/fused_elemwise_activation_op.cc | 221 +++++ .../operators/fused_elemwise_activation_op.cu | 30 + .../operators/fused_elemwise_activation_op.h | 425 +++++++++ paddle/fluid/operators/math/functors.h | 71 ++ .../tests/unittests/test_activation_op.py | 4 +- .../test_fused_elemwise_activation_op.py | 818 ++++++++++++++++++ 7 files changed, 1587 insertions(+), 2 deletions(-) create mode 100644 doc/survey/op_fusion_design.md create mode 100644 paddle/fluid/operators/fused_elemwise_activation_op.cc create mode 100644 paddle/fluid/operators/fused_elemwise_activation_op.cu create mode 100644 paddle/fluid/operators/fused_elemwise_activation_op.h create mode 100644 paddle/fluid/operators/math/functors.h create mode 100644 python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py diff --git a/doc/survey/op_fusion_design.md b/doc/survey/op_fusion_design.md new file mode 100644 index 0000000000..d6e48f4f58 --- /dev/null +++ b/doc/survey/op_fusion_design.md @@ -0,0 +1,20 @@ +# Operator fusion +Fusing multiple operators together is an important method to optimize the program execution, particularly for GPU or other specialized accelerators. 
+
+#### How to fuse the operators that have the same function efficiently?
+We take the SGD operator as an example: a training model may have hundreds of parameters and, correspondingly, the same number of SGD operators. The expression (`w = w - lr*w_g`) of those operators is the same, so during training the executor will execute this expression hundreds of times on the CPU or another specialized accelerator. If we can fuse them and make the addresses of all `w` and all `w_g` contiguous respectively, we only need to execute it once. For some accelerators, the kernel launch time is not negligible, so hundreds of kernel launches and executions may cost more than launching and executing only once. There are usually many operators similar to `SGD` in a DL model, such as `AllReduce` and `FC`. A rough sketch follows.
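A rough sketch of this second kind of fusion (hypothetical NumPy illustration; in Paddle the contiguous packing would be done by the framework, not like this):

    import numpy as np

    lr = 0.01
    params = [np.random.randn(n).astype("float32") for n in (256, 512, 128)]
    grads = [np.random.randn(n).astype("float32") for n in (256, 512, 128)]

    # Make the addresses of all w and all w_g contiguous, then apply
    # w = w - lr * w_g once: one launch instead of len(params) launches.
    w = np.concatenate(params)
    w_g = np.concatenate(grads)
    w -= lr * w_g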
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused_elemwise_activation_op.cc
new file mode 100644
index 0000000000..a6fd0aeb02
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of FusedElemwiseActivationOp op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Y"),
+        "Input(Y) of FusedElemwiseActivationOp op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FusedElemwiseActivationOp op should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto y_dim = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                      "Rank of first input must >= rank of second input.");
+
+    ctx->SetOutputDim("Out", x_dim);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx.Input<framework::Tensor>("X")->type(),
+                      ctx.Input<framework::Tensor>("Y")->type(),
+                      "The element's type of input should be the same.");
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+
+class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(vector<Tensor>)");
+    AddInput("Y", "(vector<Tensor>)");
+    AddOutput("Out", "vector<Tensor>");
+    AddAttr<int>("axis",
+                 "axis is used by elementwise_op; the default value is -1.")
+        .SetDefault(-1);
+    AddAttr<float>("scale",
+                   "scale is used by scale_op; the default value is 0.0.")
+        .SetDefault(0.0);
+    AddAttr<bool>("recomputation",
+                  "Whether to recompute the Out. "
+                  "fused_elemwise_activation_grad has two methods to get the "
+                  "dx and dy: one "
+                  "is to use the 'Out', and the other is not to use it. "
+                  "The former method will save the time of recomputing the "
+                  "'Out', but it must occupy memory to store the 'Out'. "
+                  "The latter method can avoid occupying this memory, "
+                  "but it must recompute the 'Out'. The default value is true.")
+        .SetDefault(true);
+    AddAttr<std::vector<std::string>>("functor_list",
+                                      "The functors that should be fused.")
+        .AddCustomChecker([&](const std::vector<std::string> &functor_list) {
+          PADDLE_ENFORCE(ValidCheck(functor_list));
+        });
+
+    AddComment(R"DOC(
+FusedElemwiseActivation Operator.
+
+At present, FusedElemwiseActivation only supports two kinds of compound
+operators (elementwise_op and activation_op):
+
+    Z = Binary(X, Unary(Y))
+    Z = Unary(Binary(X, Y))
+
+The attributes of activation_op can be obtained from
+fused_elemwise_activation_op's attributes. functor_list records the
+functors to be fused, for example "scale,elementwise_add".
+ +)DOC"); + } + + private: + bool ValidCheck(const std::vector &functors) { + std::unordered_set unary_fun = {"scale", "relu"}; + std::unordered_set binary_fun = {"elementwise_add"}; + + std::string unary_fun_str; + if (binary_fun.count(functors[0])) { + unary_fun_str = functors[1]; + } else if (binary_fun.count(functors[1])) { + unary_fun_str = functors[0]; + } else { + PADDLE_THROW("%s and %s are not included in fused_list.", functors[0], + functors[1]); + } + PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1, + "%s is not included in fused_list.", unary_fun_str); + return true; + } +}; + +class FusedElemwiseActivationGradMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType(this->ForwardOpType() + "_grad"); + + for (auto &input_param : this->InputNames()) { + op_desc_ptr->SetInput(input_param, this->Input(input_param)); + op_desc_ptr->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, true)); + } + + for (auto &output_param : this->OutputNames()) { + op_desc_ptr->SetInput(output_param, this->Output(output_param)); + op_desc_ptr->SetInput(framework::GradVarName(output_param), + this->OutputGrad(output_param)); + } + op_desc_ptr->SetAttrMap(this->Attrs()); + + std::vector functor_names = + boost::get>( + op_desc_ptr->GetAttr("functor_list")); + functor_names[0] += "_grad"; + functor_names[1] += "_grad"; + op_desc_ptr->SetAttr("functor_list", functor_names); + return std::unique_ptr(op_desc_ptr); + } +}; + +class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type_index = ctx.Input("X")->type(); + PADDLE_ENFORCE_EQ(input_data_type_index, + ctx.Input("Y")->type(), + "The element's type of input should be the same."); + PADDLE_ENFORCE_EQ( + input_data_type_index, + ctx.Input(framework::GradVarName("Out"))->type(), + "The element's type of input should be the same."); + + auto input_data_type = framework::ToDataType(input_data_type_index); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_elemwise_activation, ops::FusedElemwiseActivationOp, + ops::FusedElemwiseActivationMaker, + ops::FusedElemwiseActivationGradMaker); +REGISTER_OPERATOR(fused_elemwise_activation_grad, + 
ops::FusedElemwiseActivationOpGrad); + +REGISTER_OP_CPU_KERNEL( + fused_elemwise_activation, + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); + +REGISTER_OP_CPU_KERNEL( + fused_elemwise_activation_grad, + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused_elemwise_activation_op.cu new file mode 100644 index 0000000000..e1d2b16b4b --- /dev/null +++ b/paddle/fluid/operators/fused_elemwise_activation_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused_elemwise_activation_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fused_elemwise_activation, + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); + +REGISTER_OP_CUDA_KERNEL( + fused_elemwise_activation_grad, + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused_elemwise_activation_op.h new file mode 100644 index 0000000000..fe0017b824 --- /dev/null +++ b/paddle/fluid/operators/fused_elemwise_activation_op.h @@ -0,0 +1,425 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/functors.h"
+
+namespace math = paddle::operators::math;
+
+namespace paddle {
+namespace operators {
+
+// CompoundFunctors
+// For example: Z = Binary(X, Unary(Y))
+template <typename T, typename BinaryFun, typename UnaryFun>
+struct BinaryCompoundFunctor {
+  BinaryCompoundFunctor(const BinaryFun &binary_fun, const UnaryFun &unary_fun)
+      : binary_fun_(binary_fun), unary_fun_(unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y) {
+    return binary_fun_(x, unary_fun_(y));
+  }
+
+ private:
+  BinaryFun binary_fun_;
+  UnaryFun unary_fun_;
+};
+
+// For example: Z = Unary(Binary(X, Y))
+template <typename T, typename UnaryFun, typename BinaryFun>
+struct UnaryCompoundFunctor {
+  UnaryCompoundFunctor(const UnaryFun &unary_fun, const BinaryFun &binary_fun)
+      : unary_fun_(unary_fun), binary_fun_(binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y) {
+    return unary_fun_(binary_fun_(x, y));
+  }
+
+ private:
+  UnaryFun unary_fun_;
+  BinaryFun binary_fun_;
+};
+
+// FIXME(zcd): DBinaryFun and DUnaryFun have two methods to get
+// the dx: one is to use the 'out', and the other is not to use it.
+// The former method will save the time of recomputing the
+// 'out', but it must occupy memory to store the 'out'.
+// The latter method can avoid occupying this memory,
+// but it must recompute the 'out'.
+
+template <typename T, typename DBinaryFun, typename UnaryFun,
+          bool Recomputation>
+struct BinaryCompoundGradDxFunctor {
+  BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun,
+                              const UnaryFun &unary_fun)
+      : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    if (Recomputation) {
+      return dout * d_binary_fun_(x, unary_fun_(y));
+    } else {
+      return dout * d_binary_fun_(x, unary_fun_(y), out);
+    }
+  }
+
+ private:
+  DBinaryFun d_binary_fun_;
+  UnaryFun unary_fun_;
+};
+
+template <typename T, typename DBinaryFun, typename UnaryFun,
+          typename DUnaryFun, bool Recomputation>
+struct BinaryCompoundGradDyFunctor {
+  BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun,
+                              const UnaryFun &unary_fun,
+                              const DUnaryFun &d_unary_fun)
+      : d_binary_fun_(d_binary_fun),
+        unary_fun_(unary_fun),
+        d_unary_fun_(d_unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    if (Recomputation) {
+      return dout * d_binary_fun_(unary_fun_(y), x) * d_unary_fun_(y);
+    } else {
+      return dout * d_binary_fun_(unary_fun_(y), x, out) * d_unary_fun_(y);
+    }
+  }
+
+ private:
+  DBinaryFun d_binary_fun_;
+  UnaryFun unary_fun_;
+  DUnaryFun d_unary_fun_;
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation>
+struct UnaryCompoundGradDxFunctor {
+  UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun,
+                             const BinaryFun &binary_fun,
+                             const DBinaryFun &d_binary_fun)
+      : d_unary_fun_(d_unary_fun),
+        binary_fun_(binary_fun),
+        d_binary_fun_(d_binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    T base;
+    if (Recomputation) {
+      base = dout * d_unary_fun_(binary_fun_(x, y));
+    } else {
+      base = dout * d_unary_fun_(binary_fun_(x, y), out);
+    }
+    return base * d_binary_fun_(x, y);
+  }
+
+ private:
+  DUnaryFun d_unary_fun_;
+  BinaryFun binary_fun_;
+  DBinaryFun d_binary_fun_;
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation>
+struct UnaryCompoundGradDyFunctor {
+  UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun,
+                             const BinaryFun &binary_fun,
+                             const DBinaryFun &d_binary_fun)
+      : d_unary_fun_(d_unary_fun),
+        binary_fun_(binary_fun),
+        d_binary_fun_(d_binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    T base;
+    if (Recomputation) {
+      base = dout * d_unary_fun_(binary_fun_(x, y));
+    } else {
+      base = dout * d_unary_fun_(binary_fun_(x, y), out);
+    }
+    return base * d_binary_fun_(y, x);
+  }
+
+ private:
+  DUnaryFun d_unary_fun_;
+  BinaryFun binary_fun_;
+  DBinaryFun d_binary_fun_;
+};
+
+template <typename DeviceContext, typename T, typename BinaryFunctor,
+          typename UnaryFunctor>
+static void RunBinaryCompoundFunctor(const framework::ExecutionContext &ctx,
+                                     const BinaryFunctor &binary_functor,
+                                     const UnaryFunctor &unary_functor,
+                                     const framework::Tensor *in_x,
+                                     const framework::Tensor *in_y,
+                                     framework::Tensor *output) {
+  int axis = ctx.Attr<int>("axis");
+  using BinaryCompoundFunctor =
+      BinaryCompoundFunctor<T, BinaryFunctor, UnaryFunctor>;
+
+  ElementwiseComputeEx<BinaryCompoundFunctor, DeviceContext, T>(
+      ctx, in_x, in_y, axis,
+      BinaryCompoundFunctor(binary_functor, unary_functor), output);
+}
+
+template <typename DeviceContext, typename T, typename UnaryFunctor,
+          typename BinaryFunctor>
+static void RunUnaryCompoundFunctors(const framework::ExecutionContext &ctx,
+                                     const UnaryFunctor &unary_functor,
+                                     const BinaryFunctor &binary_functor,
+                                     const framework::Tensor *in_x,
+                                     const framework::Tensor *in_y,
+                                     framework::Tensor *output) {
+  int axis = ctx.Attr<int>("axis");
+
+  using UnaryCompoundFunctor =
+      UnaryCompoundFunctor<T, UnaryFunctor, BinaryFunctor>;
+
+  ElementwiseComputeEx<UnaryCompoundFunctor, DeviceContext, T>(
+      ctx, in_x, in_y, axis,
+      UnaryCompoundFunctor(unary_functor, binary_functor), output);
+}
+
+template <typename DeviceContext, typename T, typename BinaryGradFunctor,
+          typename UnaryFunctor, typename UnaryGradFunctor,
+          bool Recomputation>
+static void RunBinaryCompoundGradFunctors(
+    const framework::ExecutionContext &ctx,
+    const BinaryGradFunctor &binary_grad_functor,
+    const UnaryFunctor &unary_functor,
+    const UnaryGradFunctor &unary_grad_functor, const framework::Tensor *in_x,
+    const framework::Tensor *in_y, const framework::Tensor *in_out,
+    const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
+    framework::Tensor *y_grad) {
+  int axis = ctx.Attr<int>("axis");
+
+  using BinaryCompoundDxFunctor =
+      BinaryCompoundGradDxFunctor<T, BinaryGradFunctor, UnaryFunctor,
+                                  Recomputation>;
+  using BinaryCompoundDyFunctor =
+      BinaryCompoundGradDyFunctor<T, BinaryGradFunctor, UnaryFunctor,
+                                  UnaryGradFunctor, Recomputation>;
+
+  ElemwiseGradCompute<DeviceContext, T, BinaryCompoundDxFunctor,
+                      BinaryCompoundDyFunctor>(
+      ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad,
+      BinaryCompoundDxFunctor(binary_grad_functor, unary_functor),
+      BinaryCompoundDyFunctor(binary_grad_functor, unary_functor,
+                              unary_grad_functor));
+}
+
+template <typename DeviceContext, typename T, typename UnaryGradFunctor,
+          typename BinaryFunctor, typename BinaryGradFunctor,
+          bool Recomputation>
+static void RunUnaryCompoundGradFunctors(
+    const framework::ExecutionContext &ctx,
+    const UnaryGradFunctor &unary_grad_functor,
+    const BinaryFunctor &binary_functor,
+    const BinaryGradFunctor &binary_grad_functor, const framework::Tensor *in_x,
+    const framework::Tensor *in_y, const framework::Tensor *in_out,
+    const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
+    framework::Tensor *y_grad) {
+  int axis = ctx.Attr<int>("axis");
+
+  using UnaryCompoundDxFunctor =
+      UnaryCompoundGradDxFunctor<T, UnaryGradFunctor, BinaryFunctor,
+                                 BinaryGradFunctor, Recomputation>;
+  using UnaryCompoundDyFunctor =
+      UnaryCompoundGradDyFunctor<T, UnaryGradFunctor, BinaryFunctor,
+                                 BinaryGradFunctor, Recomputation>;
+
+  ElemwiseGradCompute<DeviceContext, T, UnaryCompoundDxFunctor,
+                      UnaryCompoundDyFunctor>(
+      ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad,
+      UnaryCompoundDxFunctor(unary_grad_functor, binary_functor,
+                             binary_grad_functor),
+      UnaryCompoundDyFunctor(unary_grad_functor, binary_functor,
+                             binary_grad_functor));
+}
+
+template <typename DeviceContext, typename T>
+static void RunFunctors(const framework::ExecutionContext &ctx,
+                        const framework::Tensor *in_x,
+                        const framework::Tensor *in_y,
+                        framework::Tensor *output) {
+  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
+  auto funcs_str = functors[0] + "," + functors[1];
+  // TODO(zcd): The following code can be refined.
+  if (funcs_str == "elementwise_add,scale") {
+    // Z = Binary(X, Unary(Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    RunBinaryCompoundFunctor<DeviceContext, T, math::AddFunctor<T>,
+                             math::ScaleFunctor<T>>(
+        ctx, math::AddFunctor<T>(), math::ScaleFunctor<T>(scale), in_x, in_y,
+        output);
+  } else if (funcs_str == "scale,elementwise_add") {
+    // Z = Unary(Binary(X, Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    RunUnaryCompoundFunctors<DeviceContext, T, math::ScaleFunctor<T>,
+                             math::AddFunctor<T>>(
+        ctx, math::ScaleFunctor<T>(scale), math::AddFunctor<T>(), in_x, in_y,
+        output);
+  } else if (funcs_str == "elementwise_add,relu") {
+    RunBinaryCompoundFunctor<DeviceContext, T, math::AddFunctor<T>,
+                             math::ReluFunctor<T>>(
+        ctx, math::AddFunctor<T>(), math::ReluFunctor<T>(), in_x, in_y,
+        output);
+  } else if (funcs_str == "relu,elementwise_add") {
+    RunUnaryCompoundFunctors<DeviceContext, T, math::ReluFunctor<T>,
+                             math::AddFunctor<T>>(
+        ctx, math::ReluFunctor<T>(), math::AddFunctor<T>(), in_x, in_y,
+        output);
+  } else {
+    PADDLE_THROW("%s has not been implemented.", funcs_str);
+  }
+}
+
+template <typename DeviceContext, typename T>
+static void RunGradFunctors(const framework::ExecutionContext &ctx,
+                            const framework::Tensor *in_x,
+                            const framework::Tensor *in_y,
+                            const framework::Tensor *in_out,
+                            const framework::Tensor *in_out_grad,
+                            framework::Tensor *x_grad,
+                            framework::Tensor *y_grad) {
+  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
+  auto funcs_str = functors[0] + "," + functors[1];
+
+  bool recomputation = ctx.Attr<bool>("recomputation");
+
+  // TODO(zcd): The following code can be refined, for example, by using
+  // registration.
+  if (funcs_str == "elementwise_add_grad,scale_grad") {
+    // The backward of Z = Binary(X, Unary(Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    if (recomputation) {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ScaleFunctor<T>,
+                                    math::ScaleGradFunctor<T>, true>(
+          ctx, math::AddGradFunctor<T>(), math::ScaleFunctor<T>(scale),
+          math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, in_out_grad,
+          x_grad, y_grad);
+    } else {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ScaleFunctor<T>,
+                                    math::ScaleGradFunctor<T>, false>(
+          ctx, math::AddGradFunctor<T>(), math::ScaleFunctor<T>(scale),
+          math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, in_out_grad,
+          x_grad, y_grad);
+    }
+  } else if (funcs_str == "scale_grad,elementwise_add_grad") {
+    // The backward of Z = Unary(Binary(X, Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    if (recomputation) {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ScaleGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   true>(ctx, math::ScaleGradFunctor<T>(scale),
+                                         math::AddFunctor<T>(),
+                                         math::AddGradFunctor<T>(), in_x, in_y,
+                                         in_out, in_out_grad, x_grad, y_grad);
+    } else {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ScaleGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   false>(ctx, math::ScaleGradFunctor<T>(scale),
+                                          math::AddFunctor<T>(),
+                                          math::AddGradFunctor<T>(), in_x, in_y,
+                                          in_out, in_out_grad, x_grad, y_grad);
+    }
+  } else if (funcs_str == "elementwise_add_grad,relu_grad") {
+    if (recomputation) {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ReluFunctor<T>,
+                                    math::ReluGradFunctor<T>, true>(
+          ctx, math::AddGradFunctor<T>(), math::ReluFunctor<T>(),
+          math::ReluGradFunctor<T>(), in_x, in_y, in_out, in_out_grad, x_grad,
+          y_grad);
+    } else {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ReluFunctor<T>,
+                                    math::ReluGradFunctor<T>, false>(
+          ctx, math::AddGradFunctor<T>(), math::ReluFunctor<T>(),
+          math::ReluGradFunctor<T>(), in_x, in_y, in_out, in_out_grad, x_grad,
+          y_grad);
+    }
+  } else if (funcs_str == "relu_grad,elementwise_add_grad") {
+    if (recomputation) {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ReluGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   true>(ctx, math::ReluGradFunctor<T>(),
+                                         math::AddFunctor<T>(),
+                                         math::AddGradFunctor<T>(), in_x, in_y,
+                                         in_out, in_out_grad, x_grad, y_grad);
+    } else {
RunUnaryCompoundGradFunctors, + math::AddFunctor, math::AddGradFunctor, + false>(ctx, math::ReluGradFunctor(), + math::AddFunctor(), + math::AddGradFunctor(), in_x, in_y, + in_out, in_out_grad, x_grad, y_grad); + } + } else { + PADDLE_THROW("%s has not been implemented.", funcs_str); + } +} + +template +class FusedElemwiseActivationKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto &in_x = detail::Ref(ctx.Input("X"), + "Cannot get input tensor %s, variable name = %s", + "X", ctx.op().Input("X")); + auto &in_y = detail::Ref(ctx.Input("Y"), + "Cannot get input tensor %s, variable name = %s", + "Y", ctx.op().Input("Y")); + auto &output = detail::Ref(ctx.Output("Out"), + "Cannot get input tensor %s, variable name = %s", + "Out", ctx.op().Output("Out")); + + RunFunctors(ctx, &in_x, &in_y, &output); + } +}; + +template +class FusedElemwiseActivationGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto &in_x = detail::Ref(ctx.Input("X"), + "Cannot get input tensor %s, variable name = %s", + "X", ctx.op().Input("X")); + auto &in_y = detail::Ref(ctx.Input("Y"), + "Cannot get input tensor %s, variable name = %s", + "Y", ctx.op().Input("Y")); + auto &in_out = detail::Ref(ctx.Input("Out"), + "Cannot get input tensor %s, variable name = %s", + "Out", ctx.op().Input("Out")); + auto &in_out_grad = + detail::Ref(ctx.Input(framework::GradVarName("Out")), + "Cannot get input tensor %s, variable name = %s", + framework::GradVarName("Out"), + ctx.op().Input(framework::GradVarName("Out"))); + + framework::Tensor *x_grad = + ctx.Output(framework::GradVarName("X")); + framework::Tensor *y_grad = + ctx.Output(framework::GradVarName("Y")); + + RunGradFunctors(ctx, &in_x, &in_y, &in_out, &in_out_grad, + x_grad, y_grad); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/functors.h b/paddle/fluid/operators/math/functors.h new file mode 100644 index 0000000000..ad2f49ccbf --- /dev/null +++ b/paddle/fluid/operators/math/functors.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// AddFunctor
+template <typename T>
+struct AddFunctor {
+  // out = x + y;
+  inline HOSTDEVICE T operator()(T x, T y) { return x + y; }
+};
+
+template <typename T>
+struct AddGradFunctor {
+  inline HOSTDEVICE T operator()(T x, T y) { return 1; }
+
+  inline HOSTDEVICE T operator()(T x, T y, T out) const { return 1; }
+};
+
+template <typename T>
+struct ScaleFunctor {
+  explicit ScaleFunctor(const T coeff) : coeff_(coeff) {}
+
+  inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; }
+
+ private:
+  T coeff_;
+};
+
+template <typename T>
+struct ScaleGradFunctor {
+  explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {}
+
+  inline HOSTDEVICE T operator()(T x) { return coeff_; }
+
+  inline HOSTDEVICE T operator()(T x, T out) { return coeff_; }
+
+ private:
+  T coeff_;
+};
+
+template <typename T>
+struct ReluFunctor {
+  inline HOSTDEVICE T operator()(T x) { return x * (x > 0); }
+};
+
+template <typename T>
+struct ReluGradFunctor {
+  inline HOSTDEVICE T operator()(T x) { return x > 0 ? 1 : 0; }
+
+  inline HOSTDEVICE T operator()(T x, T out) { return x > 0 ? 1 : 0; }
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 5ed387fb12..34f9cf0620 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -313,9 +313,9 @@ class TestAbs(OpTest):
         self.init_dtype()
 
         x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        # Because we set delta = 0.005 in caculating numeric gradient,
+        # Because we set delta = 0.005 in calculating numeric gradient,
         # if x is too small, such as 0.002, x_neg will be -0.003
-        # x_pos will be 0.007, so the numeric gradient is unaccurate.
+        # x_pos will be 0.007, so the numeric gradient is inaccurate.
         # we should avoid this
         x[np.abs(x) < 0.005] = 0.02
         out = np.abs(x)
diff --git a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
new file mode 100644
index 0000000000..ec0a939e9e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
@@ -0,0 +1,818 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
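Each forward functor in functors.h above pairs with a grad functor, and the two-argument grad overloads accept the saved forward output so the backward pass can skip recomputation. A quick NumPy finite-difference check of the scale and relu pairs (a sketch for the reader, not part of the patch); it also shows why the tests below nudge values away from the relu kink when delta = 0.005:

```python
import numpy as np

pairs = {
    # forward, analytic gradient (as in ScaleFunctor / ScaleGradFunctor)
    "scale": (lambda x: 0.1 * x, lambda x: 0.1 * np.ones_like(x)),
    # ReluFunctor / ReluGradFunctor
    "relu": (lambda x: x * (x > 0), lambda x: (x > 0).astype(x.dtype)),
}

delta = 0.005  # the step the op tests use for numeric gradients
x = np.random.uniform(-1, 1, 100)
x[np.abs(x) < 2 * delta] = 0.02  # keep sample points away from the kink

for name, (f, df) in pairs.items():
    numeric = (f(x + delta) - f(x - delta)) / (2 * delta)
    assert np.allclose(numeric, df(x), atol=1e-6), name
```

If a point sits within delta of zero, the central difference for relu straddles the kink and disagrees with the analytic gradient, which is exactly the situation the `x[np.abs(x) < 0.005] = 0.02` lines in the tests avoid.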
+import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest + +# scale + add +# TestElementwiseAddOp +# TestFusedOperatorsOp_scalar +# TestFusedOperatorsOp_scalar2 +# TestFusedOperatorsOp_Vector +# TestFusedOperatorsOp_broadcast_0 +# TestFusedOperatorsOp_broadcast_1 +# TestFusedOperatorsOp_broadcast_2 +# TestFusedOperatorsOp_broadcast_3 +# TestFusedOperatorsOp_broadcast_4 +# TestFusedOperatorsOp_rowwise_add_0 +# TestFusedOperatorsOp_rowwise_add_1 +# TestFusedOperatorsOp_channelwise_add + + +class TestElementwiseAddOp(OpTest): + def setUp(self): + self.op_type = "fused_elemwise_activation" + self.dtype = np.float32 + self.axis = -1 + + self.init_axis() + self.init_dtype() + self.init_input() + self.init_output() + self.init_attr() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.outputs = {'Out': self.out} + + def init_input(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["scale", "elementwise_add"] + } + + def init_dtype(self): + pass + + def init_axis(self): + pass + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) + + +class TestFusedOperatorsOp_scalar(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y) * self.scale + + +class TestFusedOperatorsOp_scalar2(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y) * self.scale + + +class TestFusedOperatorsOp_Vector(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.random((32, )).astype(self.dtype) + self.y = np.random.random((32, )).astype(self.dtype) + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y) * self.scale + + +class TestFusedOperatorsOp_broadcast_0(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(2).astype(self.dtype) + + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(2, 1, 1)) * self.scale + + +class TestFusedOperatorsOp_broadcast_1(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(3).astype(self.dtype) + + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(1, 3, 1)) * self.scale + + +class TestFusedOperatorsOp_broadcast_2(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(4).astype(self.dtype) + + def init_output(self): + self.scale = 0.1 + 
self.out = (self.x + self.y.reshape(1, 1, 4)) * self.scale + + +class TestFusedOperatorsOp_broadcast_3(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(3, 4).astype(self.dtype) + + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(1, 3, 4, 1)) * self.scale + + +class TestFusedOperatorsOp_broadcast_4(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(2, 1).astype(self.dtype) + + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(2, 1, 1, 1)) * self.scale + + +class TestFusedOperatorsOp_rowwise_add_0(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(3, 4).astype(self.dtype) + + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(1, 3, 4)) * self.scale + + +class TestFusedOperatorsOp_rowwise_add_1(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 1).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(1, 1)) * self.scale + + +class TestFusedOperatorsOp_channelwise_add(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(3, 20, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1).astype(self.dtype) + + def init_axis(self): + self.axis = -1 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y) * self.scale + + +# add + scale +# TestElementwiseAddOp_f_add_scale +# TestFusedOperatorsOp_scalar_f_add_scale +# TestFusedOperatorsOp_scalar2_f_add_scale +# TestFusedOperatorsOp_Vector_f_add_scale +# TestFusedOperatorsOp_broadcast_0_f_add_scale +# TestFusedOperatorsOp_broadcast_1_f_add_scale +# TestFusedOperatorsOp_broadcast_2_f_add_scale +# TestFusedOperatorsOp_broadcast_3_f_add_scale +# TestFusedOperatorsOp_broadcast_4_f_add_scale +# TestFusedOperatorsOp_rowwise_add_0_f_add_scale +# TestFusedOperatorsOp_rowwise_add_1_f_add_scale +# TestFusedOperatorsOp_channelwise_add_f_add_scale + + +class TestFusedOperatorsOp_f_add_scale(TestElementwiseAddOp): + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_scalar_f_add_scale(TestFusedOperatorsOp_scalar): + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_scalar2_f_add_scale(TestFusedOperatorsOp_scalar2): + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_Vector_f_add_scale(TestFusedOperatorsOp_Vector): + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", 
"scale"] + } + + +class TestFusedOperatorsOp_broadcast_0_f_add_scale( + TestFusedOperatorsOp_broadcast_0): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y.reshape(2, 1, 1) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_broadcast_1_f_add_scale( + TestFusedOperatorsOp_broadcast_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y.reshape(1, 3, 1) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_broadcast_2_f_add_scale( + TestFusedOperatorsOp_broadcast_2): + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y.reshape(1, 1, 4) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_broadcast_3_f_add_scale( + TestFusedOperatorsOp_broadcast_3): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y.reshape(1, 3, 4, 1) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_broadcast_4_f_add_scale( + TestFusedOperatorsOp_broadcast_4): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.scale = 0.2 + self.out = self.x + self.y.reshape(2, 1, 1, 1) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_rowwise_add_0_f_add_scale( + TestFusedOperatorsOp_rowwise_add_0): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y.reshape(1, 3, 4) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_rowwise_add_1_f_add_scale( + TestFusedOperatorsOp_rowwise_add_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.2 + self.out = self.x + self.y.reshape(1, 1) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_channelwise_add_f_add_scale( + TestFusedOperatorsOp_channelwise_add): + def init_axis(self): + self.axis = -1 + + def init_output(self): + self.scale = 0.2 + self.out = self.x + self.y * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +# add + relu +# TestElementwiseAddOp_f_add_relu +# TestFusedOperatorsOp_scalar_f_add_relu +# TestFusedOperatorsOp_scalar2_f_add_relu +# TestFusedOperatorsOp_Vector_f_add_relu +# TestFusedOperatorsOp_broadcast_0_f_add_relu +# TestFusedOperatorsOp_broadcast_1_f_add_relu +# TestFusedOperatorsOp_broadcast_2_f_add_relu +# TestFusedOperatorsOp_broadcast_3_f_add_relu +# TestFusedOperatorsOp_broadcast_4_f_add_relu +# TestFusedOperatorsOp_rowwise_add_0_f_add_relu +# TestFusedOperatorsOp_rowwise_add_1_f_add_relu +# TestFusedOperatorsOp_channelwise_add_f_add_relu + + +class 
TestFusedOperatorsOp_f_add_relu(TestElementwiseAddOp): + def init_output(self): + # Copy from test_activation_op.py + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. + # we should avoid this + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y, 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_scalar_f_add_relu(TestFusedOperatorsOp_scalar): + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y, 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_scalar2_f_add_relu(TestFusedOperatorsOp_scalar2): + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y, 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_Vector_f_add_relu(TestFusedOperatorsOp_Vector): + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y, 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_broadcast_0_f_add_relu( + TestFusedOperatorsOp_broadcast_0): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(2, 1, 1), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_broadcast_1_f_add_relu( + TestFusedOperatorsOp_broadcast_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(1, 3, 1), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_broadcast_2_f_add_relu( + TestFusedOperatorsOp_broadcast_2): + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(1, 1, 4), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_broadcast_3_f_add_relu( + TestFusedOperatorsOp_broadcast_3): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(1, 3, 4, 1), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_broadcast_4_f_add_relu( + TestFusedOperatorsOp_broadcast_4): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(2, 1, 1, 1), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_rowwise_add_0_f_add_relu( + TestFusedOperatorsOp_rowwise_add_0): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(1, 3, 4), 0) + + def init_attr(self): 
+ self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_rowwise_add_1_f_add_relu( + TestFusedOperatorsOp_rowwise_add_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(1, 1), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_channelwise_add_f_add_relu( + TestFusedOperatorsOp_channelwise_add): + def init_axis(self): + self.axis = -1 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y, 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +# relu + add +# TestElementwiseAddOp_f_relu_add +# TestFusedOperatorsOp_scalar_f_relu_add +# TestFusedOperatorsOp_scalar2_f_relu_add +# TestFusedOperatorsOp_Vector_f_relu_add +# TestFusedOperatorsOp_broadcast_0_f_relu_add +# TestFusedOperatorsOp_broadcast_1_f_relu_add +# TestFusedOperatorsOp_broadcast_2_f_relu_add +# TestFusedOperatorsOp_broadcast_3_f_relu_add +# TestFusedOperatorsOp_broadcast_4_f_relu_add +# TestFusedOperatorsOp_rowwise_add_0_f_relu_add +# TestFusedOperatorsOp_rowwise_add_1_f_relu_add +# TestFusedOperatorsOp_channelwise_add_f_relu_add + + +class TestFusedOperatorsOp_f_relu_add(TestElementwiseAddOp): + def init_output(self): + # Copy from test_activation_op.py + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. + # we should avoid this + self.out = self.x + self.y + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_scalar_f_relu_add(TestFusedOperatorsOp_scalar): + def init_output(self): + self.out = self.x + self.y + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_scalar2_f_relu_add(TestFusedOperatorsOp_scalar2): + def init_output(self): + self.out = self.x + self.y + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_Vector_f_relu_add(TestFusedOperatorsOp_Vector): + def init_output(self): + self.out = self.x + self.y + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_broadcast_0_f_relu_add( + TestFusedOperatorsOp_broadcast_0): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.out = self.x + self.y.reshape(2, 1, 1) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_broadcast_1_f_relu_add( + TestFusedOperatorsOp_broadcast_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.out = self.x + self.y.reshape(1, 3, 1) + 
self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_broadcast_2_f_relu_add( + TestFusedOperatorsOp_broadcast_2): + def init_output(self): + self.out = self.x + self.y.reshape(1, 1, 4) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_broadcast_3_f_relu_add( + TestFusedOperatorsOp_broadcast_3): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.out = self.x + self.y.reshape(1, 3, 4, 1) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_broadcast_4_f_relu_add( + TestFusedOperatorsOp_broadcast_4): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.out = self.x + self.y.reshape(2, 1, 1, 1) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_rowwise_add_0_f_relu_add( + TestFusedOperatorsOp_rowwise_add_0): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.out = self.x + self.y.reshape(1, 3, 4) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_rowwise_add_1_f_relu_add( + TestFusedOperatorsOp_rowwise_add_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.out = self.x + self.y.reshape(1, 1) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_channelwise_add_f_relu_add( + TestFusedOperatorsOp_channelwise_add): + def init_axis(self): + self.axis = -1 + + def init_output(self): + self.out = self.x + self.y + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +if __name__ == '__main__': + unittest.main() From c69ae865db87b72535fe831a7a580dcc7a5092c6 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 8 Aug 2018 11:54:39 +0000 Subject: [PATCH 11/22] fix comments --- .../analysis/data_flow_graph_to_fluid_pass.cc | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 8eac84343d..aaf7ca6701 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -87,7 +87,7 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { } void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, - framework::proto::BlockDesc &block) { + framework::proto::BlockDesc *block) { static int counter{0}; PADDLE_ENFORCE(node->IsFunctionBlock()); framework::OpDesc desc; @@ -112,11 +112,23 @@ void 
CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
   desc.SetType("tensorrt_engine");
   std::unordered_map<std::string, std::string> output_name_map;
-  auto subgraph_nodes = func->subgraph;
-  for (int index = 0; index < block.ops_size(); index++) {
-    framework::proto::OpDesc *op = block.mutable_ops(index);
-    // auto &op = block.mutable_ops(index);
+  // The following procedure renames all the intermediate variables
+  // and the output variables of the subgraph.
+  // Why do we do this?
+  // During the transition from a fluid op to a TensorRT op, we map
+  // each input and output Tensor (fluid data structure) of the fluid
+  // op to the corresponding ITensor (TRT data structure) through the
+  // Tensor name. When we set up the ITensor for a variable, we must
+  // ensure that it has not been set before.
+  // If a variable in the fluid graph is both the output of one op and
+  // the input of another, this breaks. So we rename the variables in
+  // the subgraph to make sure each one is either an op's input or an
+  // op's output, never both.
+
+  auto subgraph_nodes = func->subgraph;
+  for (int index = 0; index < block->ops_size(); index++) {
+    framework::proto::OpDesc *op = block->mutable_ops(index);
     auto correspond_node = subgraph_nodes[index];
     PADDLE_ENFORCE_EQ(correspond_node->name(), op->type());
@@ -124,10 +136,9 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
     for (auto *in_var : correspond_node->inlinks) {
       var2id[in_var->name()] = in_var->id();
     }
-    // TODO(zhaolong): add comments
+    // rename the input variables of the op inside the subgraph
     for (int i = 0; i < op->inputs_size(); i++) {
       framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i);
-      // auto &in_var = op->mutable_inputs(i);
       std::vector<std::string> replaced_names;
       for (int k = 0; k < in_var->arguments_size(); k++) {
         std::string arg_value = in_var->arguments(k);
@@ -148,6 +159,7 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
       var2id[out_var->name()] = out_var->id();
     }
 
+    // rename the output variables of the op inside the subgraph
    for (int i = 0; i < op->outputs_size(); i++) {
       framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
       std::vector<std::string> replaced_names;
@@ -165,15 +177,18 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
       }
     }
   }
+  // When the TensorRT engine runs, at the end of the operation
+  // output_mapping helps us copy the data from the renamed ITensors
+  // back to the output Tensors.
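In short, a subgraph variable name is made unique by appending the producing node's id, and output_name_map records the new name of each engine output so results can be routed back. A Python sketch of the idea (a hypothetical helper, not the actual pass):

```python
def rename_subgraph_vars(ops, var2id, engine_output_names):
    """Each op is (op_type, input_names, output_names); names are
    renamed in place by appending the corresponding node id."""
    output_name_map = {}
    for _, input_names, output_names in ops:
        for i, name in enumerate(input_names):
            input_names[i] = name + str(var2id[name])
        for i, name in enumerate(output_names):
            new_name = name + str(var2id[name])
            if name in engine_output_names:
                output_name_map[name] = new_name  # for copy-back at run time
            output_names[i] = new_name
    return output_name_map

# e.g. a var "fc_out" with node id 7 becomes "fc_out7"; if "fc_out" is an
# engine output, output_name_map["fc_out"] == "fc_out7".
```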
std::vector output_mapping; for (auto name : output_names) { PADDLE_ENFORCE(output_name_map.count(name) != 0); output_mapping.push_back(output_name_map[name]); } - PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc"); + PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc"); // Set attrs - SetAttr(desc.Proto(), "subgraph", block.SerializeAsString()); + SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize); SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size); @@ -220,7 +235,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { *block_desc.Proto()->mutable_vars() = argument_->origin_program_desc->blocks(0).vars(); PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); - CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto()); + CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto()); auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto *op = main_block->add_ops(); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); From 8e604a10aaa713a42cf2693c65bc6d9dfebbb139 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 8 Aug 2018 20:09:56 +0800 Subject: [PATCH 12/22] fix mac compile error --- paddle/fluid/operators/parallel_do_op.cc | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index 916cdad3fd..eb09470f37 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -163,12 +163,11 @@ class ParallelDoOp : public framework::OperatorBase { auto &place = places[place_idx]; auto *cur_scope = sub_scopes[place_idx]; - workers.emplace_back( - framework::Async([program, cur_scope, place, block, place_idx] { - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); } for (auto &worker : workers) { worker.wait(); @@ -239,12 +238,11 @@ class ParallelDoGradOp : public framework::OperatorBase { auto *cur_scope = sub_scopes[i]; // execute - workers.emplace_back( - framework::Async([program, cur_scope, place, block, i] { - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); } for (auto &worker : workers) { worker.wait(); From e823ce68bbf7a0f86e2dc45120efa8cc155390d4 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 8 Aug 2018 13:26:19 +0000 Subject: [PATCH 13/22] filter redundant output --- .../inference/analysis/data_flow_graph.cc | 28 +++++++++++++++++++ .../inference/analysis/data_flow_graph.h | 1 + .../analysis/data_flow_graph_to_fluid_pass.cc | 1 + 3 files changed, 30 insertions(+) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index 8a3af0a8eb..7f64bc75ae 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ 
b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -337,6 +337,34 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT std::vector(outputs.begin(), outputs.end())); } +void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { + std::vector op_nodes; + for (auto &node : GraphTraits(graph).nodes_in_TS()) { + if (node.type() == Node::Type::kValue || node.deleted()) { + continue; + } + op_nodes.push_back(&node); + } + size_t op_num = op_nodes.size(); + for (size_t i = 0; i < op_num; i++) { + if (op_nodes[i]->type() == Node::Type::kFunction) continue; + std::unordered_set follow_up_input_names; + for (size_t j = i + 1; j < op_num; j++) { + for (auto *in : op_nodes[j]->inlinks) { + follow_up_input_names.insert(in->name()); + } + } + std::vector filtered_subgraph_outlinks; + for (auto *out : op_nodes[i]->outlinks) { + if (follow_up_input_names.count(out->name())) { + filtered_subgraph_outlinks.push_back(out); + } + } + PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL); + op_nodes[i]->outlinks = filtered_subgraph_outlinks; + } +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 16aeae4d35..bb3ec6bbc1 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -178,6 +178,7 @@ struct GraphTraits { std::pair, std::vector> ExtractInputAndOutputOfSubGraph(std::vector &graph); // NOLINT +void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph); } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 2328d87042..7365c826a8 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -52,6 +52,7 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { bool DataFlowGraphToFluidPass::Finalize() { return true; } void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { + FilterRedundantOutputOfSubGraph(graph); LOG(INFO) << "graph.inputs " << graph->inputs.size(); for (auto &node : GraphTraits(graph).nodes_in_TS()) { if (node.deleted()) continue; From c4f8afa2582031c578f852439bd2b5d6864668c1 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 8 Aug 2018 13:48:02 +0000 Subject: [PATCH 14/22] Fix bug when memopt optimize loss.grad and use ParallelExecutor --- .../details/multi_devices_graph_builder.cc | 14 ++++++++------ .../details/multi_devices_graph_builder.h | 4 +++- paddle/fluid/framework/operator.cc | 6 ++++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 5ca2ed8f96..a4fdbcb26d 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -275,7 +275,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( if (strategy_.gradient_scale_ != BuildStrategy::GradientScaleStrategy::kCustomized) { // TODO(paddle-dev): Why is there no input for this op_handle? 
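The change below removes a fragile assumption: the scale-loss-grad op used to reconstruct the gradient variable's name from the loss variable's name, but the memory-optimize pass may have renamed that gradient by the time the graph is built. A small sketch of the failure mode (the renamed form shown is hypothetical; only the "@GRAD" suffix convention is fluid's):

```python
GRAD_SUFFIX = "@GRAD"  # fluid's gradient-name convention

def grad_var_name(name):
    return name + GRAD_SUFFIX

loss = "mean_0.tmp_0"
assumed = grad_var_name(loss)  # what the builder used to synthesize

# After memory optimization reuses buffers, the backward op's real
# output can carry a different (renamed) identifier, for example:
actual = "mean_0.tmp_0@GRAD@RENAME@0"  # hypothetical renamed form

# The fix: read the name from the op's output arguments instead of
# reconstructing it from the loss variable's name.
loss_grad_name = actual
assert loss_grad_name != assumed
```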
- CreateScaleLossGradOp(&result); + auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; + CreateScaleLossGradOp(&result, loss_grad_name); } // This assumes the backward generating code will ensure IsScaleLossOp // is true only for the op that scale the final scalar loss. @@ -535,7 +536,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, return got == sharded_var_device.end() ? -1 : got->second; } -void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { +void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( + ir::Graph *result, const std::string &loss_grad_name) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle #ifdef PADDLE_WITH_CUDA @@ -558,10 +560,10 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { // loss->pending_ops_.emplace_back(op_handle); // op_handle->inputs_.emplace_back(loss); - CreateOpOutput(result, op_handle, - result->CreateEmptyNode(GradVarName(loss_var_name_), - ir::Node::Type::kVariable), - places_[i], i); + CreateOpOutput( + result, op_handle, + result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable), + places_[i], i); } } diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 099dbe5abe..f2cb6bb1c8 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -75,7 +75,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; - void CreateScaleLossGradOp(ir::Graph *result) const; + void CreateScaleLossGradOp(ir::Graph *result, + const std::string &loss_grad_name) const; + VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; void CreateComputationalOp(ir::Graph *result, ir::Node *node, diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 0c8acf71bf..16c7f819f3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -778,6 +778,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { auto& scope = ctx.scope(); int data_type = -1; + std::string last_input_name; for (auto& input : this->inputs_) { for (auto& ipt_name : input.second) { auto* var = scope.FindVar(ipt_name); @@ -794,9 +795,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( int tmp = static_cast(ToDataType(t->type())); PADDLE_ENFORCE( tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same. Get %d != %d", Type(), - data_type, tmp); + "DataType of Paddle Op %s must be the same. 
Get %s(%d) != %s(%d)", + Type(), last_input_name, data_type, ipt_name, tmp); data_type = tmp; + last_input_name = ipt_name; } } } From 943950c1908413cc26812dc94c23ca6e8426de16 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 8 Aug 2018 14:32:04 +0000 Subject: [PATCH 15/22] refine graph draw --- paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc index a6f8548475..c05b0e5d46 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc @@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { for (size_t i = 0; i < graph->nodes.size(); i++) { const Node &node = graph->nodes.Get(i); if (!config_.display_deleted_node && node.deleted()) continue; - for (auto &in : node.inlinks) { - if (!config_.display_deleted_node && in->deleted()) continue; - dot.AddEdge(in->repr(), node.repr(), {}); + for (auto &out : node.outlinks) { + if (!config_.display_deleted_node && out->deleted()) continue; + dot.AddEdge(node.repr(), out->repr(), {}); } } return dot.Build(); From 8da651057cdcdf266b8205a023473f4c8c1e7189 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Thu, 9 Aug 2018 14:46:28 +0800 Subject: [PATCH 16/22] add TestAsyncDistLookupTable --- .../tests/unittests/test_dist_transpiler.py | 41 +++++++++++++++++++ .../fluid/transpiler/distribute_transpiler.py | 20 +++++---- 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index b24036326d..91b80ca75f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -464,5 +464,46 @@ class TestDistLookupTable(TestDistLookupTableBase): self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) +class TestAsyncDistLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=True) + + def transpiler_test_impl(self): + config = fluid.DistributeTranspilerConfig() + config.sync_mode = False + + pserver1, startup1 = self.get_pserver(self.pserver1_ep, config) + + self.assertEqual(len(pserver1.blocks), 6) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["adam", "scale", "scale"]) + # 2 optimize for table sgd + self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"]) + # 3 prefetch -> lookup_sparse_table for data0 + self.assertEqual([op.type for op in pserver1.blocks[3].ops], + ["lookup_sparse_table"]) + # 4 prefetch -> lookup_sparse_table for data1 + self.assertEqual([op.type for op in pserver1.blocks[4].ops], + ["lookup_sparse_table"]) + # 5 save table + self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"]) + + trainer = self.get_trainer(config) + self.assertEqual(len(trainer.blocks), 1) + print([op.type for op in trainer.blocks[0].ops]) + ops = [ + 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', + 'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul', + 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', + 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', + 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sequence_pool_grad', 
'lookup_table_grad', + 'sum', 'split_ids', 'send', 'recv', 'recv' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index b0a100e1db..be96c20db1 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -124,6 +124,7 @@ class DistributeTranspilerConfig(object): slice_var_up = True split_method = None min_block_size = 8192 + sync_mode = True class DistributeTranspiler(object): @@ -197,7 +198,7 @@ class DistributeTranspiler(object): program = default_main_program() self.origin_program = program self.trainer_num = trainers - self.sync_mode = sync_mode + self.sync_mode = sync_mode and self.config.sync_mode self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") self.pserver_endpoints = pserver_endpoints @@ -293,14 +294,15 @@ class DistributeTranspiler(object): RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) - program.global_block().append_op( - type="fetch_barrier", - inputs={}, - outputs={}, - attrs={ - "endpoints": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE - }) + if self.sync_mode: + program.global_block().append_op( + type="fetch_barrier", + inputs={}, + outputs={}, + attrs={ + "endpoints": pserver_endpoints, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + }) for varname, splited_var in self.param_var_mapping.iteritems(): if len(splited_var) <= 1: From 1b6902133d8e920f419b42088a695f0ea2511f72 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Thu, 9 Aug 2018 15:04:15 +0800 Subject: [PATCH 17/22] add TestAsyncLocalLookupTable --- .../tests/unittests/test_dist_transpiler.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 91b80ca75f..1f37e4e91d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -464,6 +464,39 @@ class TestDistLookupTable(TestDistLookupTableBase): self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) +class TestAsyncLocalLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + config = fluid.DistributeTranspilerConfig() + config.sync_mode = False + pserver1, startup1 = self.get_pserver(self.pserver1_ep, config) + + self.assertEqual(len(pserver1.blocks), 3) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["adam", "scale", "scale"]) + # 2 optimize for table adam + # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num + self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["adam", "scale", "scale"]) + + trainer = self.get_trainer(config) + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', + 'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean', + 'fill_constant', 'mean_grad', 'cross_entropy_grad', + 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', + 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv', + 'recv', 'recv', 'concat' + ] + 
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + class TestAsyncDistLookupTable(TestDistLookupTableBase): def net_conf(self): self.network_with_table(is_sparse=True, is_distributed=True) From 0954adbdcd0218c060082cbb7384df967e2fb0ad Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 9 Aug 2018 17:20:08 +0800 Subject: [PATCH 18/22] move download urls --- python/paddle/dataset/conll05.py | 8 ++++---- python/paddle/dataset/wmt14.py | 2 +- python/paddle/v2/dataset/conll05.py | 8 ++++---- python/paddle/v2/dataset/wmt14.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py index 4e94ce8989..a7c3c5402e 100644 --- a/python/paddle/dataset/conll05.py +++ b/python/paddle/dataset/conll05.py @@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert'] DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' DATA_MD5 = '387719152ae52d60422c016e92a742fc' -WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt' WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' -VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt' VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' -TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt' TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' -EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb' EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' UNK_IDX = 0 diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py index f0908c7378..7a157e3497 100644 --- a/python/paddle/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -40,7 +40,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' 'wmt_shrinked_data/wmt14.tgz') MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' # BLEU of this trained model is 26.92 -URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz' MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' START = "" diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index 0d544efac9..8312900dc4 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert'] DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' DATA_MD5 = '387719152ae52d60422c016e92a742fc' -WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt' WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' -VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt' VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' -TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt' TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' -EMB_URL = 
'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb' EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' UNK_IDX = 0 diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 5104e29051..1ec210f265 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -41,7 +41,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' 'wmt_shrinked_data/wmt14.tgz') MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' # BLEU of this trained model is 26.92 -URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz' MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' START = "" From cef35855353750c7cd3d55a04ebea5936c6c2b51 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 6 Aug 2018 09:13:10 +0000 Subject: [PATCH 19/22] add cub support --- CMakeLists.txt | 1 + cmake/external/cub.cmake | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 cmake/external/cub.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 23bb27e77b..01af4d0f3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,6 +156,7 @@ include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) +include(external/cub) if(WITH_DISTRIBUTE) if(WITH_GRPC) diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake new file mode 100644 index 0000000000..c94849cf4b --- /dev/null +++ b/cmake/external/cub.cmake @@ -0,0 +1,35 @@ +if(NOT WITH_GPU) + return() +endif() + +include(ExternalProject) + +set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub) +set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub) + +include_directories(${CUB_INCLUDE_DIR}) + +ExternalProject_Add( + extern_cub + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/NVlabs/cub.git" + GIT_TAG "v1.8.0" + PREFIX ${CUB_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(cub STATIC ${dummyfile}) +else() + add_library(cub INTERFACE) +endif() + +add_dependencies(cub extern_cub) + +LIST(APPEND externl_project_dependencies cub) From 2ae32f0bf0ab23a0c53286da25052e09ebfea04d Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Thu, 9 Aug 2018 22:06:40 +0800 Subject: [PATCH 20/22] revert the change of api --- .../tests/unittests/test_dist_transpiler.py | 20 +++++++++---------- .../fluid/transpiler/distribute_transpiler.py | 3 +-- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 1f37e4e91d..abd3721268 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -51,17 +51,17 @@ class TranspilerTest(unittest.TestCase): self.origin_prog = main.clone() return main - def get_trainer(self, config=None): - t = self._transpiler_instance(config) + def get_trainer(self, config=None, sync_mode=True): + t = self._transpiler_instance(config, sync_mode) return t.get_trainer_program() - def get_pserver(self, ep, config=None): - t = self._transpiler_instance(config) + def get_pserver(self, ep, 
config=None, sync_mode=True): + t = self._transpiler_instance(config, sync_mode) pserver = t.get_pserver_program(ep) startup = t.get_startup_program(ep, pserver) return pserver, startup - def _transpiler_instance(self, config=None): + def _transpiler_instance(self, config=None, sync_mode=True): if not self.transpiler: main = self.get_main_program() self.transpiler = fluid.DistributeTranspiler(config=config) @@ -69,7 +69,8 @@ class TranspilerTest(unittest.TestCase): self.trainer_id, program=main, pservers=self.pserver_eps, - trainers=self.trainers) + trainers=self.trainers, + sync_mode=sync_mode) return self.transpiler @@ -470,8 +471,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): def transpiler_test_impl(self): config = fluid.DistributeTranspilerConfig() - config.sync_mode = False - pserver1, startup1 = self.get_pserver(self.pserver1_ep, config) + pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False) self.assertEqual(len(pserver1.blocks), 3) # 0 listen_and_serv @@ -503,9 +503,8 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): def transpiler_test_impl(self): config = fluid.DistributeTranspilerConfig() - config.sync_mode = False - pserver1, startup1 = self.get_pserver(self.pserver1_ep, config) + pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False) self.assertEqual(len(pserver1.blocks), 6) # 0 listen_and_serv @@ -525,7 +524,6 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): trainer = self.get_trainer(config) self.assertEqual(len(trainer.blocks), 1) - print([op.type for op in trainer.blocks[0].ops]) ops = [ 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul', diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index be96c20db1..820509bbcc 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -124,7 +124,6 @@ class DistributeTranspilerConfig(object): slice_var_up = True split_method = None min_block_size = 8192 - sync_mode = True class DistributeTranspiler(object): @@ -198,7 +197,7 @@ class DistributeTranspiler(object): program = default_main_program() self.origin_program = program self.trainer_num = trainers - self.sync_mode = sync_mode and self.config.sync_mode + self.sync_mode = sync_mode self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") self.pserver_endpoints = pserver_endpoints From 7555cfe33a7291ff8b3ee981cc096e6ebd0cea15 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Fri, 10 Aug 2018 09:09:36 +0800 Subject: [PATCH 21/22] fix inference double free bug (#12613) --- paddle/fluid/inference/api/api.cc | 26 ++++++++++++++++--- .../inference/api/paddle_inference_api.h | 3 ++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index e74f23ff96..63c3f0d7b3 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { @@ -40,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { + if (!other.memory_owned_) { + data_ = other.data_; + length_ = other.length_; + memory_owned_ = other.memory_owned_; + } else { + Resize(other.length()); + memcpy(data_, other.data(), other.length()); + length_ = other.length(); + memory_owned_ = true; + } + return *this; +} + +PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { // only the buffer with external memory can be copied - assert(!other.memory_owned_); data_ = other.data_; length_ = other.length_; memory_owned_ = other.memory_owned_; + other.data_ = nullptr; + other.length_ = 0; + other.memory_owned_ = false; return *this; } void PaddleBuf::Resize(size_t length) { // Only the owned memory can be reset, the external memory can't be changed. if (length_ == length) return; - assert(memory_owned_); - Free(); + if (memory_owned_) { + Free(); + } data_ = new char[length]; length_ = length; memory_owned_ = true; @@ -68,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { assert(length_ > 0); - delete static_cast(data_); + delete[] static_cast(data_); data_ = nullptr; length_ = 0; } diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 59b0df7968..b24414e824 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -40,11 +40,12 @@ class PaddleBuf { // Copy only available when memory is managed externally. explicit PaddleBuf(const PaddleBuf&); PaddleBuf& operator=(const PaddleBuf&); + PaddleBuf& operator=(PaddleBuf&&); // Do not own the memory. PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. - explicit PaddleBuf(size_t length) + PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. 
void Resize(size_t length);
From 8499559c42be255acded3a079e275a923e683a70 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Fri, 10 Aug 2018 15:14:53 +0800
Subject: [PATCH 22/22] "fix style" (#12600)

---
 paddle/fluid/operators/elementwise_op_function.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index eb8272e90c..bc3e95e904 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -534,8 +534,8 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
                          const framework::Tensor& dout, int axis,
                          framework::Tensor* dx, framework::Tensor* dy,
                          DX_OP dx_op, DY_OP dy_op) {
-  const framework::DDim x_dim = x.dims();
-  const framework::DDim y_dim = y.dims();
+  const framework::DDim& x_dim = x.dims();
+  const framework::DDim& y_dim = y.dims();
   if (x.dims() == y.dims()) {
     ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
         ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
@@ -558,19 +558,19 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
                                  framework::Tensor* dx, framework::Tensor* dy,
                                  DX_OP dx_op, DY_OP dy_op) {
   if (dy == nullptr) {
-    const framework::DDim dx_dims = dout.dims();
+    const framework::DDim& dx_dims = dout.dims();
     auto dy_dims = dx_dims;
     ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
         ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
   } else {
     if (dout.dims() == dy->dims()) {
-      const framework::DDim dx_dims = dout.dims();
-      const framework::DDim dy_dims = dy->dims();
+      const framework::DDim& dx_dims = dout.dims();
+      const framework::DDim& dy_dims = dy->dims();
       ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
           ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
     } else {  // Y is a scalar
       auto dx_dims = dout.dims();
-      const framework::DDim dy_dims = dy->dims();
+      const framework::DDim& dy_dims = dy->dims();
       ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
           ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
     }
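For context on the broadcast and no-broadcast paths this style fix touches: when Y is broadcast over X in the forward pass, Y's gradient must be reduced over the broadcast dimensions in the backward pass. A NumPy illustration (not the Paddle kernel), using the (2, 3, 4) + (3,) shape with axis=1 exercised in the tests earlier in this series:

```python
import numpy as np

# Forward: out = x + y with y broadcast along dims 0 and 2.
x = np.random.rand(2, 3, 4)
y = np.random.rand(3)
out = x + y.reshape(1, 3, 1)

# Backward for out = x + y: dx = dout, while dy is dout summed over the
# dimensions y was broadcast across (this is what the WithBroadcast
# path implements with its reduction).
dout = np.ones_like(out)
dx = dout
dy = dout.sum(axis=(0, 2))

assert dx.shape == x.shape and dy.shape == y.shape
```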