From 67308822f85a387433867ea330624b9c16ae029c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sun, 30 Sep 2018 19:45:05 +0800 Subject: [PATCH 01/15] Add selected_rows merge for clip_by_norm op test=develop --- paddle/fluid/operators/CMakeLists.txt | 3 ++- paddle/fluid/operators/clip_by_norm_op.h | 24 +++++++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index b61bca8c3d..e10fc422fa 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -229,7 +229,7 @@ if(WITH_DISTRIBUTE) op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) endforeach() - + #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op # listen_and_serv_op sum_op executor SERIAL) @@ -267,6 +267,7 @@ if (WITH_GPU AND TENSORRT_FOUND) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() +op_library(clip_by_norm_op DEPS selected_rows_functor) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) op_library(print_op DEPS lod_tensor) diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 5af0eb0b2a..8346115913 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/transform.h" namespace paddle { @@ -31,10 +32,31 @@ class ClipByNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto max_norm = context.Attr("max_norm"); - auto* input = context.Input("X"); + auto in_var = context.InputVar("X"); auto* output = context.Output("Out"); output->mutable_data(context.GetPlace()); + const Tensor* input = nullptr; + if (in_var->IsType()) { + input = context.Input("X"); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); + + // merge ids in selected rows first + math::scatter::MergeAdd merge_func; + auto* merged_input = const_cast(context.scope()) + .Var() + ->GetMutable(); + merge_func(context.template device_context(), *x, + merged_input); + input = &(merged_input->value()); + } else { + PADDLE_THROW("Unexpected branch, input variable type is %s", + in_var->Type().name()); + } + + PADDLE_ENFORCE_NOT_NULL(input); + auto x = EigenVector::Flatten(*input); auto out = EigenVector::Flatten(*output); auto x_norm = x.square().sum().sqrt(); From f20fc955395a907e68136dd4fccce29660f5d140 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 6 Oct 2018 20:18:09 +0800 Subject: [PATCH 02/15] Resize output ddims and rows --- paddle/fluid/operators/clip_by_norm_op.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 8346115913..7144524a4c 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -33,12 +33,14 @@ class ClipByNormKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto max_norm = context.Attr("max_norm"); auto in_var 
= context.InputVar("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); + Tensor* output = nullptr; const Tensor* input = nullptr; if (in_var->IsType()) { input = context.Input("X"); + + output = context.Output("Out"); + output->mutable_data(context.GetPlace()); } else if (in_var->IsType()) { auto* x = context.Input("X"); @@ -50,6 +52,11 @@ class ClipByNormKernel : public framework::OpKernel { merge_func(context.template device_context(), *x, merged_input); input = &(merged_input->value()); + + auto* output_selected_rows = context.Output("Out"); + output_selected_rows->set_rows(merged_input.rows()); + output = output_selected_rows->mutable_data(); + output->Resize(framework::make_ddim(merged_input.value().dims())); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", in_var->Type().name()); From bcd8c2ccc35f48a6563715562f525d30ac498e6f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 8 Oct 2018 15:51:36 +0800 Subject: [PATCH 03/15] Add unit test --- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/clip_by_norm_op.h | 22 ++++++----- .../tests/unittests/test_clip_by_norm_op.py | 38 +++++++++++++++++++ 3 files changed, 52 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e10fc422fa..cafd7b11ae 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -267,7 +267,7 @@ if (WITH_GPU AND TENSORRT_FOUND) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() -op_library(clip_by_norm_op DEPS selected_rows_functor) +op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) op_library(print_op DEPS lod_tensor) diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 7144524a4c..9f99c8a3f9 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/transform.h" @@ -23,6 +24,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using SelectedRows = framework::SelectedRows; template using EigenVector = framework::EigenVector; @@ -41,22 +43,24 @@ class ClipByNormKernel : public framework::OpKernel { output = context.Output("Out"); output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); // merge ids in selected rows first math::scatter::MergeAdd merge_func; - auto* merged_input = const_cast(context.scope()) - .Var() - ->GetMutable(); + SelectedRows* merged_input = + const_cast(context.scope()) + .Var() + ->GetMutable(); merge_func(context.template device_context(), *x, merged_input); input = &(merged_input->value()); - auto* output_selected_rows = context.Output("Out"); - output_selected_rows->set_rows(merged_input.rows()); - output = output_selected_rows->mutable_data(); - output->Resize(framework::make_ddim(merged_input.value().dims())); + SelectedRows* output_selected_rows = context.Output("Out"); + output_selected_rows->set_rows(merged_input->rows()); + output_selected_rows->set_height(merged_input->height()); + output = output_selected_rows->mutable_value(); + output->Resize(merged_input->value().dims()); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", in_var->Type().name()); diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index 6103c3aafc..6556c0875e 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -18,6 +18,8 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core + class TestClipByNormOp(OpTest): def setUp(self): @@ -62,5 +64,41 @@ class TestCase3(TestClipByNormOp): self.max_norm = 1.0 +class TestClipByNormOpWithSelectedRows(OpTest): + def setUp(self): + self.initTestCase() + + self.max_relative_error = 0.006 + + scope = core.Scope() + x_selected_rows = scope.var('X').get_selected_rows() + x_selected_rows.set_rows([1, 1, 2, 0]) + x_tensor = x_selected_rows.get_tensor() + x_tensor = np.random.random((4, 1)).astype("float32") + x_tensor[np.abs(x_tensor) < self.max_relative_error] = 0.5 + + self.op_type = "clip_by_norm" + self.inputs = {'X': x_selected_rows, } + self.attrs = {} + self.attrs['max_norm'] = self.max_norm + y_tensor = np.zeros((3, 1)) + y_tensor[0::1] = np.sum(x_tensor[0::1], x_tensor[1::1]) + y_tensor[1::1] = x_tensor[2::1] + y_tensor[2::1] = x_tensor[3::1] + norm = np.sqrt(np.sum(np.square(y_tensor))) + if norm > self.max_norm: + output = self.max_norm * y_tensor / norm + else: + output = y_tensor + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + def initTestCase(self): + self.shape = (100, ) + self.max_norm = 1.0 + + if __name__ == '__main__': unittest.main() From 1456b8ec7dd7d1a13b7bf3e4d1c14e2a10fb0a38 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 10 Oct 2018 18:53:15 +0800 Subject: [PATCH 04/15] Add unittest for clip_by_norm_op with SelectedRows test=develop --- paddle/fluid/operators/clip_by_norm_op.h | 1 + .../tests/unittests/test_clip_by_norm_op.py 
| 69 ++++++++++++------- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 9f99c8a3f9..855c4d7067 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -61,6 +61,7 @@ class ClipByNormKernel : public framework::OpKernel { output_selected_rows->set_height(merged_input->height()); output = output_selected_rows->mutable_value(); output->Resize(merged_input->value().dims()); + output->mutable_data(context.GetPlace()); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", in_var->Type().name()); diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index 6556c0875e..46433d7825 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -18,6 +18,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid as fluid import paddle.fluid.core as core @@ -65,39 +66,57 @@ class TestCase3(TestClipByNormOp): class TestClipByNormOpWithSelectedRows(OpTest): - def setUp(self): - self.initTestCase() - - self.max_relative_error = 0.006 - + def check_with_place(self, place): + self.config_test_case() scope = core.Scope() + + # set input x_selected_rows = scope.var('X').get_selected_rows() - x_selected_rows.set_rows([1, 1, 2, 0]) + x_selected_rows.set_rows(self.grad_rows) x_tensor = x_selected_rows.get_tensor() - x_tensor = np.random.random((4, 1)).astype("float32") - x_tensor[np.abs(x_tensor) < self.max_relative_error] = 0.5 - - self.op_type = "clip_by_norm" - self.inputs = {'X': x_selected_rows, } - self.attrs = {} - self.attrs['max_norm'] = self.max_norm - y_tensor = np.zeros((3, 1)) - y_tensor[0::1] = np.sum(x_tensor[0::1], x_tensor[1::1]) - y_tensor[1::1] = x_tensor[2::1] - y_tensor[2::1] = x_tensor[3::1] - norm = np.sqrt(np.sum(np.square(y_tensor))) + x_np = np.random.random(self.grad_shape).astype("float32") + x_np[np.abs(x_np) < self.max_relative_error] = 0.5 + x_tensor.set(x_np, place) + + # set output + out_selected_rows = scope.var('Out').get_selected_rows() + + # run clip_by_norm_op + clip_by_norm_op = fluid.op.Operator( + "clip_by_norm", max_norm=self.max_norm, X='X', Out='Out') + clip_by_norm_op.run(scope, place) + + # check output + self.assertEqual(out_selected_rows.rows(), self.grad_clipped_rows) + out_tensor = out_selected_rows.get_tensor() + y_np = np.zeros(self.grad_clipped_shape) + y_np[0] = np.sum(x_np[0:2]) + y_np[1] = x_np[2] + y_np[2] = x_np[3] + norm = np.sqrt(np.sum(np.square(y_np))) if norm > self.max_norm: - output = self.max_norm * y_tensor / norm + output = self.max_norm * y_np / norm else: - output = y_tensor - self.outputs = {'Out': output} + output = y_np + self.assertTrue( + np.allclose( + np.array(out_tensor), output, atol=1e-5, equal_nan=False)) - def test_check_output(self): - self.check_output() + def test_clip_by_norm_with_selected_ros(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) - def initTestCase(self): - self.shape = (100, ) + for place in places: + self.check_with_place(place) + + def config_test_case(self): self.max_norm = 1.0 + self.max_relative_error = 0.006 + self.grad_shape = (4, 1) + self.grad_clipped_shape = (3, 1) + self.grad_rows = [0, 0, 1, 2] + self.grad_clipped_rows = [0, 1, 2] if __name__ == '__main__': From 
40b17be4b0523492435714536de8c329c09925f3 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Mon, 1 Oct 2018 15:52:32 +0200 Subject: [PATCH 05/15] Pass: Fuse Conv + Bias test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 6 +- .../ir/conv_bias_mkldnn_fuse_pass.cc | 78 +++++++++++++ .../framework/ir/conv_bias_mkldnn_fuse_pass.h | 34 ++++++ .../ir/conv_bias_mkldnn_fuse_pass_tester.cc | 106 ++++++++++++++++++ .../framework/ir/graph_pattern_detector.cc | 32 ++++++ .../framework/ir/graph_pattern_detector.h | 21 ++++ paddle/fluid/inference/analysis/analyzer.h | 1 + 7 files changed, 276 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0076a8bece..fbfb0776a9 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -30,6 +30,7 @@ pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(fc_fuse_pass inference) if (WITH_MKLDNN) + pass_library(conv_bias_mkldnn_fuse_pass inference) pass_library(conv_relu_mkldnn_fuse_pass inference) endif () pass_library(attention_lstm_fuse_pass inference) @@ -51,6 +52,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) -if (WITH_MKLDNN) +if(WITH_MKLDNN) + cc_test(test_conv_bias_mkldnn_fuse_pass SRCS conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) -endif () +endif() diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000..d0bd09a4f6 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" +namespace paddle { +namespace framework { +namespace ir { +std::unique_ptr ConvBiasFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("conv_bias_mkldnn_fuse", graph.get()); + GraphPatternDetector gpd; + auto* conv_input = gpd.mutable_pattern() + ->NewNode("conv_bias_mkldnn_fuse/conv_input") + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), + "conv_bias_mkldnn_fuse"); + conv_bias_pattern(conv_input); + int found_conv_bias_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBias fuse"; + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, + conv_bias_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern); // tmp + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern); // CONV op + // bias + GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern); + // output + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern); + // elementwise_add op + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern); + // Create an ConvBias Node. + OpDesc desc; + std::string conv_bias_i_in = subgraph.at(conv_input)->Name(); + std::string conv_bias_w_in = conv_weight->Name(); + std::string conv_bias_b_in = eltwise_bias->Name(); + std::string conv_bias_out = eltwise_out->Name(); + desc.SetInput("Input", std::vector({conv_bias_i_in})); + desc.SetInput("Filter", std::vector({conv_bias_w_in})); + desc.SetInput("Bias", std::vector({conv_bias_b_in})); + desc.SetOutput("Output", std::vector({conv_bias_out})); + desc.SetType("conv2d"); + for (auto& attr : conv->Op()->GetAttrMap()) { + desc.SetAttr(attr.first, attr.second); + } + auto conv_bias_node = g->CreateOpNode(&desc); // OpDesc will be copied. + GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out}); + PADDLE_ENFORCE(subgraph.count(conv_input)); + IR_NODE_LINK_TO(subgraph.at(conv_input), conv_bias_node); + IR_NODE_LINK_TO(conv_weight, conv_bias_node); + IR_NODE_LINK_TO(eltwise_bias, conv_bias_node); + IR_NODE_LINK_TO(conv_bias_node, eltwise_out); + found_conv_bias_count++; + }; + gpd(graph.get(), handler); + AddStatis(found_conv_bias_count); + return graph; +} +} // namespace ir +} // namespace framework +} // namespace paddle +REGISTER_PASS(conv_bias_mkldnn_fuse_pass, + paddle::framework::ir::ConvBiasFusePass); diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h new file mode 100644 index 0000000000..187453b2a6 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" +namespace paddle { +namespace framework { +namespace ir { +/* +* Fuse the Conv and Elementwise_add to a ConvBiasOp. +*/ +class ConvBiasFusePass : public FusePassBase { + public: + virtual ~ConvBiasFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc new file mode 100644 index 0000000000..50fc62c173 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", true); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + } else if (type == "elementwise_add") { + op->SetInput("X", {inputs[0]}); + op->SetInput("Y", {inputs[1]}); + } + op->SetOutput("Out", outputs); +} + +// a->OP0->b +// b->OP1->c +// (c, weights)->conv->f +// (f, bias)->elementwise_add->g +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"b"}), + std::vector({"c"})); + SetOp(&prog, "conv2d", std::vector({"c", "weights"}), + std::vector({"f"})); + SetOp(&prog, "elementwise_add", std::vector({"f", "bias"}), + std::vector({"g"})); + + return prog; +} + +TEST(ConvBiasFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + // Remove 3 Nodes: conv, elementwise_add, conv_out + // Add 1 Node: ConvBias + EXPECT_EQ(original_nodes_num - 2, current_nodes_num); + + // Assert conv_bias op in newly generated graph + int conv_bias_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + if (node->Op()->HasAttr("use_mkldnn")) { + bool use_mkldnn = 
boost::get(node->Op()->GetAttr("use_mkldnn")); + if (use_mkldnn) { + auto names = node->Op()->InputNames(); + if (std::find(names.begin(), names.end(), "Bias") != names.end()) { + conv_bias_count++; + } + } + } + } + } + EXPECT_EQ(conv_bias_count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_bias_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 46c6a52c09..4be1ead0d4 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -858,6 +858,38 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } +PDNode *patterns::ConvBias::operator()( + paddle::framework::ir::PDNode *conv_input) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + auto *eltiwse_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + // Create variables + // Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + // intermediate variable, will be removed in the IR after fuse. + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d") + ->assert_is_op_input("elementwise_add"); + // Bias stored in elementwise_add + auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + // output + auto *eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add"); + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + eltiwse_op->LinksFrom({conv_out_var, eltwise_bias_var}) + .LinksTo({eltwise_out_var}); + return eltwise_out_var; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 508113bf4f..60fb13b4f6 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -540,6 +540,27 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(d_ele_y); PATTERN_DECL_NODE(ele_y); }; + +// Conv with Elementwise_add as bias +// op: conv + elementwise_add +// named nodes: +// conv_input, conv_weight, +// conv_out, conv, +// eltwise_bias, eltwise_out, +// elementwise_add +struct ConvBias : public PatternBase { + ConvBias(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_bias") {} + PDNode* operator()(PDNode* conv_input); + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(eltwise); + // declare variable node's name + PATTERN_DECL_NODE(conv_weight); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(eltwise_bias); + PATTERN_DECL_NODE(eltwise_out); +}; } // namespace patterns // Link two ir::Nodes from each other. 
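// A rough sketch of the graph rewrite this ConvBias pattern enables, using the
// node names declared above (see the handler in conv_bias_mkldnn_fuse_pass.cc
// earlier in this patch):
//
//   before:  conv_out    = conv2d(Input=conv_input, Filter=conv_weight)
//            eltwise_out = elementwise_add(X=conv_out, Y=eltwise_bias)
//
//   after:   eltwise_out = conv2d(Input=conv_input, Filter=conv_weight,
//                                 Bias=eltwise_bias)
//
// The conv and eltwise op nodes and the intermediate conv_out variable are
// removed from the graph, and a single fused conv2d node is linked in their
// place.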
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 0aa9367bf5..b2bab73d1b 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -74,6 +74,7 @@ class Analyzer : public OrderedRegistry { "seq_concat_fc_fuse_pass", // "fc_fuse_pass", // #ifdef PADDLE_WITH_MKLDNN + "conv_bias_mkldnn_fuse_pass", // "conv_relu_mkldnn_fuse_pass", // #endif }}; From 7e651c8641f8f197aa127a7eef63c2e8eb403a71 Mon Sep 17 00:00:00 2001 From: whs Date: Thu, 11 Oct 2018 10:09:44 +0800 Subject: [PATCH 06/15] Fix truncated norm (#13785) * Fix truncated normal. * test=develop --- paddle/fluid/operators/truncated_gaussian_random_op.cc | 2 +- paddle/fluid/operators/truncated_gaussian_random_op.cu | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index d854e28039..1e8708f264 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -148,7 +148,7 @@ struct TruncatedNormal { T operator()(T value) const { auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return (std::sqrt(2.0) * Erfinv(2 * p - 1) + mean) * std; + return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; } }; diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu index ad2a9021bf..5a3510babe 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -42,7 +42,7 @@ struct TruncatedNormal { rng.discard(n); T value = dist(rng); auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return (std::sqrt(2.0) * erfinvf(2 * p - 1) + mean) * std; + return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; } }; @@ -52,6 +52,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); if (seed == 0) { std::random_device rd; From 9b11a175025cce59f0ba362f53f0c39f3bf35490 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Thu, 11 Oct 2018 11:54:59 +0800 Subject: [PATCH 07/15] Revert "[MKLDNN] Pass: Fuse Conv + Bias" --- paddle/fluid/framework/ir/CMakeLists.txt | 6 +- .../ir/conv_bias_mkldnn_fuse_pass.cc | 78 ------------- .../framework/ir/conv_bias_mkldnn_fuse_pass.h | 34 ------ .../ir/conv_bias_mkldnn_fuse_pass_tester.cc | 106 ------------------ .../framework/ir/graph_pattern_detector.cc | 32 ------ .../framework/ir/graph_pattern_detector.h | 21 ---- paddle/fluid/inference/analysis/analyzer.h | 1 - 7 files changed, 2 insertions(+), 276 deletions(-) delete mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc delete mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h delete mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 79390e9321..796ce1f91c 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -30,7 +30,6 @@ pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(fc_fuse_pass inference) if (WITH_MKLDNN) - 
pass_library(conv_bias_mkldnn_fuse_pass inference) pass_library(conv_relu_mkldnn_fuse_pass inference) endif () pass_library(attention_lstm_fuse_pass inference) @@ -53,7 +52,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) -if(WITH_MKLDNN) - cc_test(test_conv_bias_mkldnn_fuse_pass SRCS conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass) +if (WITH_MKLDNN) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) -endif() +endif () diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc deleted file mode 100644 index d0bd09a4f6..0000000000 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" -#include -#include -#include "paddle/fluid/platform/enforce.h" -namespace paddle { -namespace framework { -namespace ir { -std::unique_ptr ConvBiasFusePass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("conv_bias_mkldnn_fuse", graph.get()); - GraphPatternDetector gpd; - auto* conv_input = gpd.mutable_pattern() - ->NewNode("conv_bias_mkldnn_fuse/conv_input") - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), - "conv_bias_mkldnn_fuse"); - conv_bias_pattern(conv_input); - int found_conv_bias_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "handle ConvBias fuse"; - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, - conv_bias_pattern); // Filter - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern); // tmp - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern); // CONV op - // bias - GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern); - // output - GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern); - // elementwise_add op - GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern); - // Create an ConvBias Node. 
- OpDesc desc; - std::string conv_bias_i_in = subgraph.at(conv_input)->Name(); - std::string conv_bias_w_in = conv_weight->Name(); - std::string conv_bias_b_in = eltwise_bias->Name(); - std::string conv_bias_out = eltwise_out->Name(); - desc.SetInput("Input", std::vector({conv_bias_i_in})); - desc.SetInput("Filter", std::vector({conv_bias_w_in})); - desc.SetInput("Bias", std::vector({conv_bias_b_in})); - desc.SetOutput("Output", std::vector({conv_bias_out})); - desc.SetType("conv2d"); - for (auto& attr : conv->Op()->GetAttrMap()) { - desc.SetAttr(attr.first, attr.second); - } - auto conv_bias_node = g->CreateOpNode(&desc); // OpDesc will be copied. - GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out}); - PADDLE_ENFORCE(subgraph.count(conv_input)); - IR_NODE_LINK_TO(subgraph.at(conv_input), conv_bias_node); - IR_NODE_LINK_TO(conv_weight, conv_bias_node); - IR_NODE_LINK_TO(eltwise_bias, conv_bias_node); - IR_NODE_LINK_TO(conv_bias_node, eltwise_out); - found_conv_bias_count++; - }; - gpd(graph.get(), handler); - AddStatis(found_conv_bias_count); - return graph; -} -} // namespace ir -} // namespace framework -} // namespace paddle -REGISTER_PASS(conv_bias_mkldnn_fuse_pass, - paddle::framework::ir::ConvBiasFusePass); diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h deleted file mode 100644 index 187453b2a6..0000000000 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/pass.h" -namespace paddle { -namespace framework { -namespace ir { -/* -* Fuse the Conv and Elementwise_add to a ConvBiasOp. -*/ -class ConvBiasFusePass : public FusePassBase { - public: - virtual ~ConvBiasFusePass() {} - - protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; -}; -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc deleted file mode 100644 index 50fc62c173..0000000000 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" - -#include - -namespace paddle { -namespace framework { -namespace ir { - -void SetOp(ProgramDesc* prog, const std::string& type, - const std::vector& inputs, - const std::vector& outputs) { - auto* op = prog->MutableBlock(0)->AppendOp(); - op->SetType(type); - if (type == "conv2d") { - op->SetAttr("use_mkldnn", true); - op->SetInput("Input", {inputs[0]}); - op->SetInput("Filter", {inputs[1]}); - } else if (type == "elementwise_add") { - op->SetInput("X", {inputs[0]}); - op->SetInput("Y", {inputs[1]}); - } - op->SetOutput("Out", outputs); -} - -// a->OP0->b -// b->OP1->c -// (c, weights)->conv->f -// (f, bias)->elementwise_add->g -ProgramDesc BuildProgramDesc() { - ProgramDesc prog; - for (auto& v : - std::vector({"a", "b", "c", "weights", "bias", "f", "g"})) { - auto* var = prog.MutableBlock(0)->Var(v); - var->SetType(proto::VarType::SELECTED_ROWS); - if (v == "weights" || v == "bias") { - var->SetPersistable(true); - } - } - - SetOp(&prog, "OP0", std::vector({"a"}), - std::vector({"b"})); - SetOp(&prog, "OP1", std::vector({"b"}), - std::vector({"c"})); - SetOp(&prog, "conv2d", std::vector({"c", "weights"}), - std::vector({"f"})); - SetOp(&prog, "elementwise_add", std::vector({"f", "bias"}), - std::vector({"g"})); - - return prog; -} - -TEST(ConvBiasFusePass, basic) { - auto prog = BuildProgramDesc(); - - std::unique_ptr graph(new ir::Graph(prog)); - - auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass"); - - int original_nodes_num = graph->Nodes().size(); - - graph = pass->Apply(std::move(graph)); - - int current_nodes_num = graph->Nodes().size(); - - // Remove 3 Nodes: conv, elementwise_add, conv_out - // Add 1 Node: ConvBias - EXPECT_EQ(original_nodes_num - 2, current_nodes_num); - - // Assert conv_bias op in newly generated graph - int conv_bias_count = 0; - - for (auto* node : graph->Nodes()) { - if (node->IsOp() && node->Op()->Type() == "conv2d") { - if (node->Op()->HasAttr("use_mkldnn")) { - bool use_mkldnn = boost::get(node->Op()->GetAttr("use_mkldnn")); - if (use_mkldnn) { - auto names = node->Op()->InputNames(); - if (std::find(names.begin(), names.end(), "Bias") != names.end()) { - conv_bias_count++; - } - } - } - } - } - EXPECT_EQ(conv_bias_count, 1); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -USE_PASS(conv_bias_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index a8364cc05f..8625b562e7 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -964,38 +964,6 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } -PDNode *patterns::ConvBias::operator()( - paddle::framework::ir::PDNode *conv_input) { - // Create Operators - conv_input->assert_is_op_input("conv2d", "Input"); - auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); - auto *eltiwse_op = - pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); - // Create variables - // Filter - auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("conv2d", "Filter"); - // intermediate variable, will be removed in the IR after fuse. 
- auto *conv_out_var = pattern->NewNode(conv_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op("conv2d") - ->assert_is_op_input("elementwise_add"); - // Bias stored in elementwise_add - auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr()) - ->AsInput() - ->assert_is_op_input("elementwise_add", "Y"); - // output - auto *eltwise_out_var = pattern->NewNode(eltwise_out_repr()) - ->AsOutput() - ->assert_is_op_output("elementwise_add"); - conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); - eltiwse_op->LinksFrom({conv_out_var, eltwise_bias_var}) - .LinksTo({eltwise_out_var}); - return eltwise_out_var; -} - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 9dfd7046ca..cdd6413d96 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -578,27 +578,6 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(d_ele_y); PATTERN_DECL_NODE(ele_y); }; - -// Conv with Elementwise_add as bias -// op: conv + elementwise_add -// named nodes: -// conv_input, conv_weight, -// conv_out, conv, -// eltwise_bias, eltwise_out, -// elementwise_add -struct ConvBias : public PatternBase { - ConvBias(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "conv_bias") {} - PDNode* operator()(PDNode* conv_input); - // declare operator node's name - PATTERN_DECL_NODE(conv); - PATTERN_DECL_NODE(eltwise); - // declare variable node's name - PATTERN_DECL_NODE(conv_weight); - PATTERN_DECL_NODE(conv_out); - PATTERN_DECL_NODE(eltwise_bias); - PATTERN_DECL_NODE(eltwise_out); -}; } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index e7d9cb8994..765145cb7d 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -76,7 +76,6 @@ class Analyzer : public OrderedRegistry { "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // #ifdef PADDLE_WITH_MKLDNN - "conv_bias_mkldnn_fuse_pass", // "conv_relu_mkldnn_fuse_pass", // #endif }}; From 63b2e98f3d8783624016f56e0000eabf3dbcd02f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 10 Oct 2018 13:22:43 +0800 Subject: [PATCH 08/15] Explain LoD and a few other concepts test=develop --- paddle/fluid/pybind/pybind.cc | 45 +++++++++++++++++++++++++++- python/paddle/fluid/layers/io.py | 6 +++- python/paddle/fluid/layers/tensor.py | 2 +- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 311cd94460..a91894ba89 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -157,7 +157,50 @@ PYBIND11_PLUGIN(core) { .def("_get_double_element", TensorGetElement) .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); }); - py::class_(m, "LoDTensor") + py::class_(m, "LoDTensor", R"DOC( + LoDTensor is a Tensor with optional LoD information. + + np.array(lod_tensor) can convert LoDTensor to numpy array. + lod_tensor.lod() can retrieve the LoD information. + + LoD is short for Level of Details and is usually used for varied sequence + length. You can skip the following comment if you don't need optional LoD. + + For example: + A LoDTensor X can look like the example below. It contains 2 sequences. 
+ The first has length 2 and the second has length 3, as described by x.lod. + + The first tensor dimension 6=2+3 is calculated from LoD if it's available. + It means the total number of sequence element. In X, each element has 2 + columns, hence [6, 2]. + + x.lod = [[2, 3]] + x.data = [[1, 2], [3, 4], + [5, 6], [7, 8], [9, 10], [11, 12]] + x.shape = [6, 2] + + LoD can have multiple levels (for example, a paragraph can have multiple + sentences and a sentence can have multiple words). In the following + LodTensor Y, the lod_level is 2. It means there are 2 sequence, the + first sequence length is 2 (has 2 sub-sequences), the second one's + length is 1. The first sequence's 2 sub-sequences have length 2 and 2, + respectively. And the second sequence's 1 sub-sequence has length 3. + + y.lod = [[2 1], [2 2 3]] + y.shape = [2+2+3, ...] + + Note: + In above description, LoD is length-based. In Paddle internal + implementation, lod is offset-based. Hence, internally, + y.lod is represented as [[0, 2, 3], [0, 2, 4, 7]] (length-based + equivlent would be [[2-0, 3-2], [2-0, 4-2, 7-4]]). + + Sometimes LoD is called recursive_sequence_length to be more + self-explanatory. In this case, it must be length-based. Due to history + reasons. when LoD is called lod in public API, it might be offset-based. + Users should be careful about it. + + )DOC") .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) .def("__init__", diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 25fde782b7..a06cd4982f 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -56,7 +56,11 @@ def data(name, Args: name(str): The name/alias of the function shape(list): Tuple declaring the shape. - append_batch_size(bool): Whether or not to append the data as a batch. + append_batch_size(bool): + 1. If true, it prepends -1 to the shape. + For example if shape=[1], the resulting shape is [-1, 1]. + 2. If shape contains -1, such as shape=[1, -1], + append_batch_size will be enforced to be be False (ineffective). dtype(int|float): The type of data : float32, float_16, int etc type(VarType): The output type. By default it is LOD_TENSOR. lod_level(int): The LoD Level. 0 means the input data is not a sequence. diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 44b92af7ac..9c6a2112a6 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -100,7 +100,7 @@ def create_global_var(shape, force_cpu=False, name=None): """ - Create a new variable in the global block(block 0). + Create a new tensor variable with value in the global block(block 0). 
Args: shape(list[int]): shape of the variable From 5428cb9908740f2581112777e55a3f118b5b93d3 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 12 Oct 2018 09:25:14 +0800 Subject: [PATCH 09/15] Profiler support merge data of all thread (#13811) * profiler infor merge thread statistic information * update profiler * fix bug * add merge thread msg to report * optimize report * statistic the time of ops in each thread but not all * optimize report format * optimize profile report * optimize profile report test=develop --- paddle/fluid/platform/profiler.cc | 65 ++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 652a6ec7a4..612f3bc0e7 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -276,7 +276,7 @@ struct EventItem { // Print results void PrintProfiler(const std::vector>& events_table, const std::string& sorted_domain, const size_t name_width, - const size_t data_width, double total) { + const size_t data_width, bool merge_thread) { // Output header information std::cout << "\n------------------------->" << " Profiling Report " @@ -292,6 +292,10 @@ void PrintProfiler(const std::vector>& events_table, PADDLE_THROW("Invalid profiler state", g_state); } + if (merge_thread) { + std::cout << "Note! This Report merge all thread info into one." + << std::endl; + } std::cout << "Place: " << place << std::endl; std::cout << "Time unit: ms" << std::endl; std::cout << "Sorted by " << sorted_domain @@ -312,8 +316,7 @@ void PrintProfiler(const std::vector>& events_table, << std::setw(data_width) << event_item.min_time << std::setw(data_width) << event_item.max_time << std::setw(data_width) << event_item.ave_time - << std::setw(data_width) << event_item.total_time / total - << std::endl; + << std::setw(data_width) << event_item.ratio << std::endl; } } std::cout << std::endl; @@ -321,8 +324,10 @@ void PrintProfiler(const std::vector>& events_table, // Parse the event list and output the profiling report void ParseEvents(const std::vector>& events, + bool merge_thread, EventSortingKey sorted_by = EventSortingKey::kDefault) { if (g_state == ProfilerState::kDisabled) return; + if (merge_thread && events.size() < 2) return; std::string sorted_domain; std::function sorted_func; @@ -361,34 +366,55 @@ void ParseEvents(const std::vector>& events, sorted_domain = "event first end time"; } + const std::vector>* analyze_events; + std::vector> merged_events_list; + if (merge_thread) { + std::vector merged_events; + for (int i = 0; i < events.size(); ++i) { + for (int j = 0; j < events[i].size(); ++j) { + merged_events.push_back(events[i][j]); + } + } + merged_events_list.push_back(merged_events); + analyze_events = &merged_events_list; + } else { + analyze_events = &events; + } + std::vector> events_table; size_t max_name_width = 0; - double total = 0.; // the total time - for (size_t i = 0; i < events.size(); i++) { + for (size_t i = 0; i < (*analyze_events).size(); i++) { + double total = 0.; // the total time in one thread std::list pushed_events; std::vector event_items; std::unordered_map event_idx; - for (size_t j = 0; j < events[i].size(); j++) { - if (events[i][j].type() == EventType::kPushRange) { - pushed_events.push_back(events[i][j]); - } else if (events[i][j].type() == EventType::kPopRange) { + for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { + if ((*analyze_events)[i][j].type() == EventType::kPushRange) { + 
pushed_events.push_back((*analyze_events)[i][j]); + } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) { std::list::reverse_iterator rit = pushed_events.rbegin(); while (rit != pushed_events.rend() && - rit->name() != events[i][j].name()) { + rit->name() != (*analyze_events)[i][j].name()) { ++rit; } if (rit != pushed_events.rend()) { double event_time = (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll) - ? rit->CudaElapsedMs(events[i][j]) - : rit->CpuElapsedMs(events[i][j]); + ? rit->CudaElapsedMs((*analyze_events)[i][j]) + : rit->CpuElapsedMs((*analyze_events)[i][j]); total += event_time; - std::string event_name = - "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); - max_name_width = std::max(max_name_width, event_name.size()); + std::string event_name; + if (merge_thread) { + event_name = rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + } else { + event_name = "thread" + std::to_string(rit->thread_id()) + "::" + + rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + } if (event_idx.find(event_name) == event_idx.end()) { event_idx[event_name] = event_items.size(); @@ -413,7 +439,7 @@ void ParseEvents(const std::vector>& events, pushed_events.erase((++rit).base()); } else { LOG(WARNING) << "Cannot find the push marker of event \'" - << events[i][j].name() + << (*analyze_events)[i][j].name() << "\', which will be ignored in profiling report."; } } @@ -421,6 +447,7 @@ void ParseEvents(const std::vector>& events, // average time for (auto& item : event_items) { item.ave_time = item.total_time / item.calls; + item.ratio = item.total_time / total; } // sort if (sorted_by != EventSortingKey::kDefault) { @@ -438,7 +465,8 @@ void ParseEvents(const std::vector>& events, } // Print report - PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, total); + PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, + merge_thread); } void DisableProfiler(EventSortingKey sorted_key, @@ -449,7 +477,8 @@ void DisableProfiler(EventSortingKey sorted_key, Mark("_stop_profiler_", nullptr); std::vector> all_events = GetAllEvents(); - ParseEvents(all_events, sorted_key); + ParseEvents(all_events, true, sorted_key); + ParseEvents(all_events, false, sorted_key); ResetProfiler(); DeviceTracer* tracer = GetDeviceTracer(); if (tracer->IsEnabled()) { From 5f2e837847db9fff219333e03f4867abaa75768c Mon Sep 17 00:00:00 2001 From: Dun Date: Fri, 12 Oct 2018 13:24:57 +0800 Subject: [PATCH 10/15] optimize depthwise conv by register memory (#13778) * optimize depthwise conv by register memory * test=develop --- paddle/fluid/operators/math/depthwise_conv.cu | 275 +++++++++++++----- 1 file changed, 210 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 3be3899123..66d37c3bf3 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -46,17 +46,20 @@ __forceinline__ __device__ unsigned warp_id() { return ret; } +#define ARG_DEFINE_KernelDepthwiseConv \ + const T *const input_data, const T *const filter_data, const int batch_size, \ + const int output_channels, const int output_height, \ + const int output_width, const int input_channels, \ + const int input_height, const int input_width, \ + const int filter_multiplier, const int filter_height, \ + const int filter_width, const int stride_height, const int stride_width, \ + const int padding_height, const 
int padding_width, \ + const int dilate_height, const int dilate_width, T *const output_data + // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. template -__device__ __inline__ void KernelDepthwiseConv( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { +__device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) { for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { const int batch = blockIdx.y; @@ -97,42 +100,105 @@ __device__ __inline__ void KernelDepthwiseConv( } } -template -__global__ void KernelDepthwiseConvSp( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { - if (c_filter_multiplier == 0) - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, filter_multiplier, - filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, - dilate_height, dilate_width, output_data); +template +__device__ __inline__ void KernelDepthwiseConvCFilter( + ARG_DEFINE_KernelDepthwiseConv) { + const int kWeghtSize = c_filter * c_filter; + T r_weight[kWeghtSize]; + const int batch = blockIdx.y; + const int c_out = blockIdx.x; + const T* weight = filter_data + c_out * c_filter * c_filter; + for (int i = 0; i < c_filter * c_filter; i++) r_weight[i] = weight[i]; - else - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, - filter_height, filter_height, c_stride, c_stride, - padding_height, padding_width, dilate_height, - dilate_width, output_data); + for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { + for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { + const int batch = blockIdx.y; + const int c_out = blockIdx.x; + + const int c_in = c_out / filter_multiplier; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + c_filter * dilate_height; + const int w_in_end = w_in_start + c_filter * dilate_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? 
w_in_start : 0; + + for (int h_in = h_in_start, h_f = 0; h_f < c_filter; + h_in += dilate_height, h_f++) { + for (int w_in = w_in_start, w_f = 0; w_f < c_filter; + w_in += dilate_width, w_f++) { + if (h_in >= 0 && h_in < input_height && w_in >= 0 && + w_in < input_width) { + const int offset = in_offset + h_in * input_width + w_in; + value += r_weight[h_f * c_filter + w_f] * input_data[offset]; + } + } + } + int index = + ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + + w_out; + output_data[index] = value; + } + } +} + +template +__global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { + if (c_filter_multiplier == 0) { + if (c_filter == -1) + KernelDepthwiseConv( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, output_data); + else + KernelDepthwiseConvCFilter( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, output_data); + } else { + if (c_filter == -1) + KernelDepthwiseConv(input_data, filter_data, batch_size, + output_channels, output_height, output_width, + input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_height, + c_stride, c_stride, padding_height, padding_width, + dilate_height, dilate_width, output_data); + else + KernelDepthwiseConvCFilter( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_height, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + output_data); + } } // CUDA kernel to compute the depthwise convolution backprop w.r.t input. 
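// In summary, the input-gradient kernels below invert the forward indexing:
// each input element (batch, c_in, h_in, w_in) accumulates contributions from
// its filter_multiplier output channels c_out = c_in * filter_multiplier + i
// and from every filter tap (h_f, w_f) for which some in-range output position
// (h_out, w_out) satisfies
//   h_in == h_out * stride_height - padding_height + h_f * dilate_height
//   w_in == w_out * stride_width  - padding_width  + w_f * dilate_width
// each match adding filter[c_out][h_f][w_f] * output_grad[batch][c_out][h_out][w_out]
// to input_grad[batch][c_in][h_in][w_in]. The CFilter variant additionally
// caches the (reversed) filter weights in registers, mirroring the forward
// optimization above.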
+#define ARG_DEFINE_KernelDepthwiseConvInputGrad \ + const T *const output_grad_data, const T *const filter_data, \ + const int batch_size, const int output_channels, \ + const int output_height, const int output_width, \ + const int input_channels, const int input_height, const int input_width, \ + const int filter_multiplier, const int filter_height, \ + const int filter_width, const int stride_height, const int stride_width, \ + const int padding_height, const int padding_width, \ + const int dilate_height, const int dilate_width, \ + T *const input_grad_data + template __device__ __inline__ void KernelDepthwiseConvInputGrad( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { + ARG_DEFINE_KernelDepthwiseConvInputGrad) { for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { const int batch = blockIdx.y; @@ -184,15 +250,67 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( } } -template +template +__device__ __inline__ void KernelDepthwiseConvInputGradCFilter( + ARG_DEFINE_KernelDepthwiseConvInputGrad) { + const int kWeghtSize = c_filter * c_filter * c_filter_multiplier + 1; + T r_weight[kWeghtSize]; + const int batch = blockIdx.y; + const int c_in = blockIdx.x; + + for (int c_i = 0; c_i < filter_multiplier; c_i++) { + int c_out = c_in * filter_multiplier + c_i; + const T* weight = filter_data + c_out * c_filter * c_filter; + for (int i = 0; i < c_filter * c_filter; i++) + r_weight[i + c_i * c_filter * c_filter] = + weight[c_filter * c_filter - i - 1]; + } + + for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { + for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { + const int batch = blockIdx.y; + const int c_in = blockIdx.x; + + int h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height; + + int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; + + T value = 0; + + for (int c_i = 0; c_i < filter_multiplier; c_i++) { + int c_out = c_in * filter_multiplier + c_i; + for (int h_out = h_out_start, h_f = 0; h_f < c_filter; + h_out += dilate_height, h_f++) { + for (int w_out = w_out_start, w_f = 0; w_f < c_filter; + w_out += dilate_width, w_f++) { + int s_h_out = h_out / stride_height; + int s_w_out = w_out / stride_width; + if (h_out % stride_height == 0 && w_out % stride_width == 0 && + s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && + s_w_out < output_width) { + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + + s_h_out) * + output_width + + s_w_out; + value += + output_grad_data[output_grad_offset] * + r_weight[h_f * c_filter + w_f + c_i * c_filter * c_filter]; + } + } + } + } + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; + input_grad_data[index] = value; + } + } +} + +template __global__ void KernelDepthwiseConvInputGradSp( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const 
int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { + ARG_DEFINE_KernelDepthwiseConvInputGrad) { if (c_filter_multiplier == 0) KernelDepthwiseConvInputGrad( output_grad_data, filter_data, batch_size, output_channels, @@ -200,13 +318,20 @@ __global__ void KernelDepthwiseConvInputGradSp( filter_multiplier, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, dilate_height, dilate_width, input_grad_data); - else + else if (c_filter == -1) KernelDepthwiseConvInputGrad( output_grad_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, padding_height, padding_width, dilate_height, dilate_width, input_grad_data); + else + KernelDepthwiseConvInputGradCFilter( + output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + input_grad_data); } // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. @@ -325,12 +450,14 @@ class DepthwiseConvFunctor { dim3 threads(std::min(output_width, thread), blocks, 1); dim3 grid(output_channels, batch_size, 1); int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride) \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ if (c_filter_multiplier == 0 || \ filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ - KernelDepthwiseConvSp<<>>( \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + KernelDepthwiseConvSp<<>>( \ input_data, filter_data, batch_size, output_channels, output_height, \ output_width, input_channels, input_height, input_width, \ filter_multiplier, ksize_height, ksize_width, stride_height, \ @@ -338,11 +465,17 @@ class DepthwiseConvFunctor { dilate_width, output_data); \ return; \ } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, e.g. check_case(2^n,1) - check_case(0, 0); + check_case(1, 1, 3); + check_case(1, 1, 5); + check_case(1, 1, -1); + check_case(1, 2, 3); + check_case(1, 2, 5); + check_case(1, 2, -1); + check_case(0, 0, 3); + check_case(0, 0, 5); + check_case(0, 0, -1); +// NOTE(liangdun): 0,0 for other case +// add other case if needed, e.g. 
check_case(2^n,1) #undef check_case } }; @@ -384,13 +517,15 @@ class DepthwiseConvInputGradFunctor { dim3 grid(input_channels, batch_size, 1); int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride) \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ if (c_filter_multiplier == 0 || \ filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, \ - c_stride><<>>( \ + T, c_filter_multiplier, c_stride, \ + c_filter><<>>( \ output_grad_data, filter_data, batch_size, output_channels, \ output_height, output_width, input_channels, input_height, \ input_width, filter_multiplier, ksize_height, ksize_width, \ @@ -398,11 +533,21 @@ class DepthwiseConvInputGradFunctor { dilate_height, dilate_width, input_grad_data); \ return; \ } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, e.g. check_case(2^n,1) - check_case(0, 0); + check_case(1, 1, 3); + check_case(1, 1, 5); + check_case(1, 1, -1); + check_case(1, 2, 3); + check_case(1, 2, 5); + check_case(1, 2, -1); + check_case(2, 1, 3); + check_case(2, 1, 5); + check_case(2, 1, -1); + check_case(2, 2, 3); + check_case(2, 2, 5); + check_case(2, 2, -1); + check_case(0, 0, -1); +// NOTE(liangdun): 0,0 for other case +// add other case if needed, e.g. check_case(2^n,1) #undef check_case } }; From 228506618b23845792d7f6de74cb6b97cd7bfb13 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 12 Oct 2018 14:41:57 +0800 Subject: [PATCH 11/15] Avoid GetMutable implicitly reset Var Type. This can cause a lot of problem: 1. Wrong operator implementation, Op can get a wrong type without failure. 2. Anytype can be Get without defined in VarType. Also fix wrong STEP_SCOPE usage. test=develop --- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/feed_fetch_method.cc | 3 +-- paddle/fluid/framework/naive_executor.cc | 2 +- paddle/fluid/framework/variable.h | 6 +++++- paddle/fluid/framework/variable_test.cc | 11 ++++++----- python/paddle/fluid/layers/io.py | 6 +++++- python/paddle/fluid/layers/tensor.py | 2 +- python/paddle/fluid/tests/book/test_word2vec.py | 16 ++-------------- 8 files changed, 22 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 70ec6e90a4..a070b8efb8 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -66,7 +66,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 8e1f93c5eb..3e9353f5cf 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. 
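// The guarded GetMutable this patch introduces boils down to a simple
// contract: the first GetMutable<T>() default-constructs the payload and
// fixes the variable's type, while any later access with a different type
// fails loudly instead of silently resetting the holder. A standalone
// sketch of that contract, with simplified hypothetical types rather than
// the actual paddle::framework::Variable:
#include <memory>
#include <stdexcept>
#include <typeindex>
#include <typeinfo>

class TypedVar {
 public:
  template <typename T>
  T* GetMutable() {
    if (!holder_) {
      holder_.reset(new Holder<T>());  // first use fixes the held type
    } else if (std::type_index(typeid(T)) != holder_->Type()) {
      throw std::runtime_error("variable already holds a different type");
    }
    return static_cast<T*>(holder_->Ptr());
  }

 private:
  struct HolderBase {
    virtual ~HolderBase() = default;
    virtual void* Ptr() = 0;
    virtual std::type_index Type() const = 0;
  };
  template <typename T>
  struct Holder : HolderBase {
    T value;
    void* Ptr() override { return &value; }
    std::type_index Type() const override { return typeid(T); }
  };
  std::unique_ptr<HolderBase> holder_;
};
// A second GetMutable with a different type now throws, which is the
// behavior the updated variable_test.cc in this patch checks for.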
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = - *(g_feed_value->GetMutable>()); + auto& feed_inputs = *(g_feed_value->GetMutable()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ba10687d65..2840d503f1 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 067e0c2b83..873e1b20a5 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,8 +38,12 @@ class Variable { template T* GetMutable() { - if (!IsType()) { + if (!holder_) { holder_.reset(new PlaceholderImpl(new T())); + } else { + PADDLE_ENFORCE(IsType(), + "Variable must be type %s, the holding type is %s", + typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index c5c1d215f4..003dcfd3df 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,9 +33,10 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - std::string* s = v->GetMutable(); - *s = "hello"; - - const std::string& ss = v->Get(); - EXPECT_EQ("hello", ss); + try { + v->GetMutable(); + } catch (std::exception& e) { + return; + } + EXPECT_TRUE(false); } diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 25fde782b7..a06cd4982f 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -56,7 +56,11 @@ def data(name, Args: name(str): The name/alias of the function shape(list): Tuple declaring the shape. - append_batch_size(bool): Whether or not to append the data as a batch. + append_batch_size(bool): + 1. If true, it prepends -1 to the shape. + For example if shape=[1], the resulting shape is [-1, 1]. + 2. If shape contains -1, such as shape=[1, -1], + append_batch_size will be enforced to be be False (ineffective). dtype(int|float): The type of data : float32, float_16, int etc type(VarType): The output type. By default it is LOD_TENSOR. lod_level(int): The LoD Level. 0 means the input data is not a sequence. diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 44b92af7ac..9c6a2112a6 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -100,7 +100,7 @@ def create_global_var(shape, force_cpu=False, name=None): """ - Create a new variable in the global block(block 0). + Create a new tensor variable with value in the global block(block 0). 
Args: shape(list[int]): shape of the variable diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 9191f0fc20..1f3a230048 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -17,7 +17,6 @@ from __future__ import print_function import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo import unittest import os import numpy as np @@ -84,18 +83,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: - places = get_places() - pd = ParallelDo(places) - with pd.do(): - avg_cost, predict_word = __network__( - list( - map(pd.read_input, [ - first_word, second_word, third_word, forth_word, - next_word - ]))) - pd.write_output(avg_cost) - - avg_cost = fluid.layers.mean(pd()) + raise ValueError('is_parallel=True not implemented') sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) @@ -262,7 +250,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): for use_cuda in (False, True): for is_sparse in (False, True): - for is_parallel in (False, True): + for is_parallel in (False, ): # TODO(paddle-dev): Add parallel test. inject_test_method(use_cuda, is_sparse, is_parallel) if __name__ == '__main__': From efa5bac7ad4f1498a5ee4d340d550ea8b17fe9bf Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 12 Oct 2018 07:08:15 +0000 Subject: [PATCH 12/15] fix demo_ci bug in vis_demo.cc test=develop --- paddle/fluid/inference/api/demo_ci/run.sh | 8 +- .../api/demo_ci/trt_mobilenet_demo.cc | 88 +++++++++++++++ paddle/fluid/inference/api/demo_ci/utils.h | 59 ++++++++++ .../fluid/inference/api/demo_ci/vis_demo.cc | 105 +++--------------- 4 files changed, 164 insertions(+), 96 deletions(-) create mode 100644 paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 76238070cd..65c95f0834 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -100,19 +100,17 @@ for WITH_STATIC_LIB in ON OFF; do rm -rf * cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ -DWITH_MKL=$TURN_ON_MKL \ - -DDEMO_NAME=vis_demo \ + -DDEMO_NAME=trt_mobilenet_demo \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR make -j - ./vis_demo \ + ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ --data=$DATA_DIR/mobilenet/data.txt \ - --refer=$DATA_DIR/mobilenet/result.txt \ - --use_gpu=true \ - --use_trt=true + --refer=$DATA_DIR/mobilenet/result.txt fi done set +x diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc new file mode 100644 index 0000000000..4377627859 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains demo of mobilenet for tensorrt. + */ + +#include +#include // use glog instead of CHECK to avoid importing other paddle header files. +#include +#include + +// #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/inference/demo_ci/utils.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +#endif +DEFINE_string(modeldir, "", "Directory of the inference model."); +DEFINE_string(refer, "", "path to reference result for comparison."); +DEFINE_string( + data, "", + "path of data; each line is a record, format is " + "'\t predictor; + paddle::contrib::MixedRTConfig config; + config.param_file = FLAGS_modeldir + "/__params__"; + config.prog_file = FLAGS_modeldir + "/__model__"; + config.use_gpu = true; + config.device = 0; + config.max_batch_size = 1; + config.fraction_of_gpu_memory = 0.1; // set by yourself + predictor = CreatePaddlePredictor(config); + + VLOG(3) << "begin to process data"; + // Just a single batch of data. + std::string line; + std::ifstream file(FLAGS_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; + + VLOG(3) << "run executor"; + std::vector output; + predictor->Run({input}, &output, 1); + + VLOG(3) << "output.size " << output.size(); + auto& tensor = output.front(); + VLOG(3) << "output: " << SummaryTensor(tensor); + + // compare with reference result + CheckOutput(FLAGS_refer, tensor); +} + +} // namespace demo +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(); + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h index cb89906711..4792c97fe7 100644 --- a/paddle/fluid/inference/api/demo_ci/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -14,6 +14,8 @@ #pragma once #include +#include +#include #include #include #include "paddle/fluid/inference/paddle_inference_api.h" @@ -21,6 +23,11 @@ namespace paddle { namespace demo { +struct Record { + std::vector data; + std::vector shape; +}; + static void split(const std::string& str, char sep, std::vector* pieces) { pieces->clear(); @@ -39,6 +46,58 @@ static void split(const std::string& str, char sep, } } +Record ProcessALine(const std::string& line) { + VLOG(3) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto& d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto& s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); + return record; +} + +void CheckOutput(const std::string& referfile, const 
PaddleTensor& output) { + std::string line; + std::ifstream file(referfile); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + VLOG(3) << "predictor output numel " << numel; + VLOG(3) << "reference output numel " << refer.data.size(); + CHECK_EQ(numel, refer.data.size()); + switch (output.dtype) { + case PaddleDType::INT64: { + for (size_t i = 0; i < numel; ++i) { + CHECK_EQ(static_cast(output.data.data())[i], refer.data[i]); + } + break; + } + case PaddleDType::FLOAT32: + for (size_t i = 0; i < numel; ++i) { + CHECK_LT( + fabs(static_cast(output.data.data())[i] - refer.data[i]), + 1e-5); + } + break; + } +} + /* * Get a summary of a PaddleTensor content. */ diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index b9d627b4a5..db61786e2f 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -18,10 +18,6 @@ limitations under the License. */ #include #include // use glog instead of CHECK to avoid importing other paddle header files. -#include -#include - -// #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/inference/demo_ci/utils.h" #ifdef PADDLE_WITH_CUDA @@ -34,99 +30,28 @@ DEFINE_string( "path of data; each line is a record, format is " "'\t data; - std::vector shape; -}; - -void split(const std::string& str, char sep, std::vector* pieces); - -Record ProcessALine(const std::string& line) { - VLOG(3) << "process a line"; - std::vector columns; - split(line, '\t', &columns); - CHECK_EQ(columns.size(), 2UL) - << "data format error, should be \t"; - - Record record; - std::vector data_strs; - split(columns[0], ' ', &data_strs); - for (auto& d : data_strs) { - record.data.push_back(std::stof(d)); - } - - std::vector shape_strs; - split(columns[1], ' ', &shape_strs); - for (auto& s : shape_strs) { - record.shape.push_back(std::stoi(s)); - } - VLOG(3) << "data size " << record.data.size(); - VLOG(3) << "data shape size " << record.shape.size(); - return record; -} - -void CheckOutput(const std::string& referfile, const PaddleTensor& output) { - std::string line; - std::ifstream file(referfile); - std::getline(file, line); - auto refer = ProcessALine(line); - file.close(); - - size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); - VLOG(3) << "predictor output numel " << numel; - VLOG(3) << "reference output numel " << refer.data.size(); - CHECK_EQ(numel, refer.data.size()); - switch (output.dtype) { - case PaddleDType::INT64: { - for (size_t i = 0; i < numel; ++i) { - CHECK_EQ(static_cast(output.data.data())[i], refer.data[i]); - } - break; - } - case PaddleDType::FLOAT32: - for (size_t i = 0; i < numel; ++i) { - CHECK_LT( - fabs(static_cast(output.data.data())[i] - refer.data[i]), - 1e-5); - } - break; - } -} - /* * Use the native fluid engine to inference the demo. 
*/ -void Main(bool use_gpu, bool use_trt) { +void Main(bool use_gpu) { std::unique_ptr predictor; - if (!use_trt) { - NativeConfig config; - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.use_gpu = use_gpu; - config.device = 0; - if (FLAGS_use_gpu) { - config.fraction_of_gpu_memory = 0.1; // set by yourself - } - - VLOG(3) << "init predictor"; - predictor = - CreatePaddlePredictor(config); - } else { - paddle::contrib::MixedRTConfig config; - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.use_gpu = true; - config.device = 0; - config.max_batch_size = 1; + NativeConfig config; + config.param_file = FLAGS_modeldir + "/__params__"; + config.prog_file = FLAGS_modeldir + "/__model__"; + config.use_gpu = use_gpu; + config.device = 0; + if (FLAGS_use_gpu) { config.fraction_of_gpu_memory = 0.1; // set by yourself - predictor = CreatePaddlePredictor(config); } + VLOG(3) << "init predictor"; + predictor = + CreatePaddlePredictor(config); + VLOG(3) << "begin to process data"; // Just a single batch of data. std::string line; @@ -159,12 +84,10 @@ void Main(bool use_gpu, bool use_trt) { int main(int argc, char** argv) { google::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_use_gpu && FLAGS_use_trt) { - paddle::demo::Main(true /*use_gpu*/, true); - } else if (FLAGS_use_gpu) { - paddle::demo::Main(true /*use_gpu*/, false); + if (FLAGS_use_gpu) { + paddle::demo::Main(true /*use_gpu*/); } else { - paddle::demo::Main(false /*use_gpu*/, false /*use_tensorrt*/); + paddle::demo::Main(false /*use_gpu*/); } return 0; } From 320c78e16f96dd84e72db45d0a21f79581d9fcc5 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 12 Oct 2018 11:28:38 +0000 Subject: [PATCH 13/15] fix commets test=develop --- paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 4377627859..ffb12b5871 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -18,15 +18,9 @@ limitations under the License. */ #include #include // use glog instead of CHECK to avoid importing other paddle header files. -#include -#include - -// #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/inference/demo_ci/utils.h" -#ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); -#endif DEFINE_string(modeldir, "", "Directory of the inference model."); DEFINE_string(refer, "", "path to reference result for comparison."); DEFINE_string( @@ -38,7 +32,7 @@ namespace paddle { namespace demo { /* - * Use the native fluid engine to inference the demo. + * Use the tensorrt fluid engine to inference the demo. 
*/ void Main() { std::unique_ptr predictor; From 9c77b65c06af0a2a95e244074f5ad7afcb9dc5e8 Mon Sep 17 00:00:00 2001 From: chengduo Date: Fri, 12 Oct 2018 19:39:44 +0800 Subject: [PATCH 14/15] Fix layers.uniform_random (#13823) * fix layers.uniform_random * fix uniform_random test=develop * remove var type set test=develop * fix similar error test=develop --- paddle/fluid/operators/uniform_random_op.cc | 32 ++++++++++----------- python/paddle/fluid/layers/ops.py | 17 +++++++---- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 763bb40358..aa907595cb 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -23,14 +23,14 @@ namespace operators { template class CPUUniformRandomKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - framework::Tensor* tensor = nullptr; + void Compute(const framework::ExecutionContext &ctx) const override { + framework::Tensor *tensor = nullptr; auto out_var = ctx.OutputVar("Out"); if (out_var->IsType()) { tensor = out_var->GetMutable(); } else if (out_var->IsType()) { auto shape = ctx.Attr>("shape"); - auto* selected_rows = out_var->GetMutable(); + auto *selected_rows = out_var->GetMutable(); tensor = selected_rows->mutable_value(); tensor->Resize(framework::make_ddim(shape)); selected_rows->mutable_rows()->reserve(shape[0]); @@ -39,7 +39,7 @@ class CPUUniformRandomKernel : public framework::OpKernel { "uniform_random_op's output only" "supports SelectedRows and LoDTensor"); } - T* data = tensor->mutable_data(ctx.GetPlace()); + T *data = tensor->mutable_data(ctx.GetPlace()); unsigned int seed = static_cast(ctx.Attr("seed")); std::minstd_rand engine; if (seed == 0) { @@ -60,14 +60,14 @@ class UniformRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UniformRandomOp should not be null."); PADDLE_ENFORCE( ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), "uniform_random's min must less then max"); - auto& shape = ctx->Attrs().Get>("shape"); + auto &shape = ctx->Attrs().Get>("shape"); std::vector temp; temp.reserve(shape.size()); for (auto dim : shape) { @@ -78,7 +78,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), ctx.GetPlace()); @@ -112,17 +112,17 @@ uniform distribution. The random result is in set [min, max]. 
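For reference, the sampling path of CPUUniformRandomKernel above reduces to a std::minstd_rand engine feeding a std::uniform_real_distribution between min and max, with seed == 0 meaning "pick a nondeterministic seed". A small self-contained sketch of that logic, with a hypothetical helper name rather than the operator itself:

#include <random>
#include <vector>

// Fill a buffer with uniformly distributed values the same way the CPU
// kernel does; seed == 0 selects a nondeterministic seed.
std::vector<float> UniformFill(size_t n, float min, float max,
                               unsigned int seed) {
  std::minstd_rand engine;
  if (seed == 0) seed = std::random_device()();
  engine.seed(seed);
  std::uniform_real_distribution<float> dist(min, max);
  std::vector<float> out(n);
  for (auto& v : out) v = dist(engine);
  return out;
}

The operator itself only differs in where the data lives: it writes either into a LoDTensor or into the value tensor of a SelectedRows output, as shown in the kernel above.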
class UniformRandomOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { auto out_var_name = op_desc.Output("Out").front(); - if (block->FindRecursiveOrCreateVar(out_var_name).GetType() == - framework::proto::VarType::SELECTED_ROWS) { - block->FindRecursiveOrCreateVar(out_var_name) - .SetType(framework::proto::VarType::SELECTED_ROWS); - } else { - block->FindRecursiveOrCreateVar(out_var_name) - .SetType(framework::proto::VarType::LOD_TENSOR); + auto var_data_type = static_cast( + boost::get(op_desc.GetAttr("dtype"))); + + auto out_var = block->FindRecursiveOrCreateVar(out_var_name); + if (out_var.GetType() != framework::proto::VarType::SELECTED_ROWS) { + out_var.SetType(framework::proto::VarType::LOD_TENSOR); } + out_var.SetDataType(var_data_type); } }; diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 9a8300524d..1ff40a26f2 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -14,6 +14,8 @@ from __future__ import print_function from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr +from .. import core +from ..framework import convert_np_dtype_to_dtype_ __activations_noattr__ = [ 'sigmoid', @@ -58,8 +60,11 @@ _uniform_random_ = generate_layer_fn('uniform_random') def uniform_random(shape, dtype=None, min=None, max=None, seed=None): + locals_var = locals().keys() + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val @@ -78,8 +83,9 @@ _hard_shrink_ = generate_layer_fn('hard_shrink') def hard_shrink(x, threshold=None): + locals_var = locals().keys() kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val @@ -99,12 +105,12 @@ _cum_sum_ = generate_layer_fn('cumsum') def cumsum(x, axis=None, exclusive=None, reverse=None): + locals_var = locals().keys() kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val - return _cum_sum_(**kwargs) @@ -121,8 +127,9 @@ _thresholded_relu_ = generate_layer_fn('thresholded_relu') def thresholded_relu(x, threshold=None): + locals_var = locals().keys() kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val From d852be7c4886678d1fe12a6f37148cf7437d0a82 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sat, 13 Oct 2018 19:25:23 +0800 Subject: [PATCH 15/15] Revert "Make variable::GetMutable robust" --- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/feed_fetch_method.cc | 3 ++- paddle/fluid/framework/naive_executor.cc | 2 +- paddle/fluid/framework/variable.h | 6 +----- paddle/fluid/framework/variable_test.cc | 11 +++++------ python/paddle/fluid/tests/book/test_word2vec.py | 16 ++++++++++++++-- 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index a070b8efb8..70ec6e90a4 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -66,7 +66,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == 
proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3e9353f5cf..8e1f93c5eb 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,7 +27,8 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = *(g_feed_value->GetMutable()); + auto& feed_inputs = + *(g_feed_value->GetMutable>()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 2840d503f1..ba10687d65 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 873e1b20a5..067e0c2b83 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,12 +38,8 @@ class Variable { template T* GetMutable() { - if (!holder_) { + if (!IsType()) { holder_.reset(new PlaceholderImpl(new T())); - } else { - PADDLE_ENFORCE(IsType(), - "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 003dcfd3df..c5c1d215f4 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,10 +33,9 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - try { - v->GetMutable(); - } catch (std::exception& e) { - return; - } - EXPECT_TRUE(false); + std::string* s = v->GetMutable(); + *s = "hello"; + + const std::string& ss = v->Get(); + EXPECT_EQ("hello", ss); } diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 1f3a230048..9191f0fc20 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -17,6 +17,7 @@ from __future__ import print_function import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places +from paddle.fluid.layers.control_flow import ParallelDo import unittest import os import numpy as np @@ -83,7 +84,18 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: - raise ValueError('is_parallel=True not implemented') + places = get_places() + pd = ParallelDo(places) + with pd.do(): + avg_cost, predict_word = __network__( + list( + map(pd.read_input, [ + first_word, second_word, third_word, forth_word, + 
next_word + ]))) + pd.write_output(avg_cost) + + avg_cost = fluid.layers.mean(pd()) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) @@ -250,7 +262,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): for use_cuda in (False, True): for is_sparse in (False, True): - for is_parallel in (False, ): # TODO(paddle-dev): Add parallel test. + for is_parallel in (False, True): inject_test_method(use_cuda, is_sparse, is_parallel) if __name__ == '__main__':