From 4f6e9e3ac30ed4b3489394b79f0b1dd607432b93 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 19 Dec 2018 10:48:55 +0800 Subject: [PATCH 01/22] teacher student sigmoid loss --- .../teacher_student_sigmoid_loss_op.cc | 256 ++++++++++++++++++ .../teacher_student_sigmoid_loss_op.h | 25 ++ python/paddle/fluid/layers/nn.py | 42 +++ .../test_teacher_student_sigmoid_loss_op.py | 70 +++++ 4 files changed, 393 insertions(+) create mode 100644 paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc create mode 100644 paddle/fluid/operators/teacher_student_sigmoid_loss_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc new file mode 100644 index 0000000000..98eafb9f84 --- /dev/null +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, + "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(label_dims[1], 1UL, + "The 2nd dimension of " + "Input(Label) should be 1."); + ctx->SetOutputDim("Y", {x_dims[0], 1}); + ctx->ShareLoD("X", /*->*/ "Y"); + } + + protected: + // Explicitly set that the data type of computation kernel of + // teacher_student_sigmoid_loss + // is determined by its input "X". 
+ framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class TeacherStudentSigmoidLossGradientOp + : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2."); + PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0], + "The 1st dimension of Input(X) and Input(Y@Grad) should " + "be equal."); + PADDLE_ENFORCE_EQ(dy_dims[1], 1, + "The 2nd dimension of Input(Y@Grad) should be 1."); + PADDLE_ENFORCE_EQ(label_dims[1], 1, + "When Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + // Explicitly set that the data type of computation kernel of + // teacher_student_sigmoid_loss + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class TeacherStudentSigmoidLossOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape [N x 1]," + " where N is the batch size and D is the output. " + "This input is a probability computed by the previous operator, " + "which is almost always the result of a softmax operator."); + AddInput("Label", + "(Tensor), the ground truth which is a 2-D tensor. " + "Label is a Tensor with shape [N x 1]. "); + AddOutput("Y", + "(Tensor, default Tensor), a 2-D tensor with shape " + "[N x 1]. The teacher student sigmoid loss."); + AddAttr("soft_max_up_bound", "fp32, default 15.0").SetDefault(15.0); + AddAttr("soft_max_lower_bound", "fp32, default -15.0") + .SetDefault(-15.0); + AddComment(R"DOC( +TeacherStudentSigmoidLoss Operator. +TeacherStudentSigmoidLoss Operator. + +It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is that +we add another label(z') to original. 
+ loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) + z is click or not + z' is value q of feed_fine + label = {-2, -1, [0, 2]} + when z' is not exist, clk = 0 : label = -2; + when z' is not exist, clk = 1 : label = -1; + when z' is exist , clk = 0 : label = 0 + z'; + when z' is exist , clk = 1 : label = 1 + z'; + +)DOC"); + } +}; + +// template +template +class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()), + "This kernel only runs on CPU."); + + Tensor* y = context.Output("Y"); + const Tensor* x = context.Input("X"); + const Tensor* labels = context.Input("Label"); + T* y_data = y->mutable_data(context.GetPlace()); + const T* x_data = x->data(); + const T* label_data = labels->data(); + int64_t batch_size = x->dims()[0]; + // loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + + // log(1 + exp(-abs(x))) + // z is click or not + // z' is value q of feed_fine + // label = {-2, -1, [0, 2]} + // when z' is not exist, clk = 0 : label = -2; + // when z' is not exist, clk = 1 : label = -1; + // when z' is exist , clk = 0 : label = 0 + z'; + // when z' is exist , clk = 1 : label = 1 + z'; + for (int i = 0; i < batch_size; ++i) { + if (label_data[i] < -1.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + + log(1.0 + exp(-fabs(x_data[i]))); + } else if (label_data[i] < 0.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + + log(1.0 + exp(-fabs(x_data[i]))); + } else if (label_data[i] < 1.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + + log(1.0 + exp(-fabs(x_data[i]))) + + (x_data[i] > 0 ? x_data[i] : 0.0) - + x_data[i] * label_data[i] + + log(1.0 + exp(-fabs(x_data[i]))); + } else { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + + log(1.0 + exp(-fabs(x_data[i]))) + + (x_data[i] > 0 ? 
x_data[i] : 0.0) - + x_data[i] * (label_data[i] - 1.0) + + log(1.0 + exp(-fabs(x_data[i]))); + } + } + } +}; + +template +class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + const T* x_data = x->data(); + + Tensor* dx = context.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(context.GetPlace()); + + const Tensor* labels = context.Input("Label"); + const T* label_data = labels->data(); + + T soft_max_up_bound = + static_cast(context.Attr("soft_max_up_bound")); + T soft_max_lower_bound = + static_cast(context.Attr("soft_max_lower_bound")); + + int64_t batch_size = x->dims()[0]; + + const framework::Tensor* dOut = + context.Input(framework::GradVarName("Y")); + + const T* dout_data = dOut->data(); + + for (int i = 0; i < batch_size; ++i) { + T sum_val = x_data[i]; + if (sum_val > soft_max_up_bound) { + sum_val = soft_max_up_bound; + } else { + if (sum_val < soft_max_lower_bound) { + sum_val = soft_max_lower_bound; + } + } + + T pred = 1.0 / (1.0 + exp(-sum_val)); + if (label_data[i] < -1.0) { + dx_data[i] = 0.0 - pred; + } else if (label_data[i] < 0.0) { + dx_data[i] = 1.0 - pred; + } else { + dx_data[i] = label_data[i] - 2.0 * pred; + } + if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) { + dx_data[i] = 0; + } + dx_data[i] *= dout_data[i] * -1; + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(teacher_student_sigmoid_loss, + ops::TeacherStudentSigmoidLossOp, + ops::TeacherStudentSigmoidLossOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad, + ops::TeacherStudentSigmoidLossGradientOp); + +REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss, + ops::TeacherStudentSigmoidLossOpKernel, + ops::TeacherStudentSigmoidLossOpKernel); + +REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss_grad, + ops::TeacherStudentSigmoidLossGradOpKernel, + ops::TeacherStudentSigmoidLossGradOpKernel); diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h new file mode 100644 index 0000000000..77b2760e9c --- /dev/null +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9e6cd1a0ab..68243cf744 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -176,6 +176,7 @@ __all__ = [ 'get_tensor_from_selected_rows', 'lstm', 'psroi_pool', + 'teacher_student_sigmoid_loss', ] kIgnoreIndex = -100 @@ -9184,6 +9185,47 @@ def log_loss(input, label, epsilon=1e-4, name=None): return loss +def teacher_student_sigmoid_loss(input, + label, + soft_max_up_bound=15.0, + soft_max_lower_bound=-15.0): + """ + **Teacher Student Log Loss Layer** + + This layer accepts input predictions and target label and returns the + teacher_student loss. + + .. math:: + loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) + + Args: + input (Variable|list): a 2-D tensor with shape [N x 1], where N is the + batch size. This input is a probability computed + by the previous operator. + label (Variable|list): the ground truth which is a 2-D tensor with + shape [N x 1], where N is the batch size. + soft_max_up_bound (float): if input > soft_max_up_bound, will be bound + soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound + + Returns: + Variable: A 2-D tensor with shape [N x 1], the teacher_student_sigmoid_loss. + + Examples: + .. code-block:: python + cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label) + """ + helper = LayerHelper('teacher_student_sigmoid_loss', **locals()) + out = helper.create_variable(dtype=input.dtype) + helper.append_op( + type='teacher_student_sigmoid_loss', + inputs={'X': [input], + 'Label': [label]}, + outputs={'Y': [out]}, + attrs={"soft_max_lower_bound": float(soft_max_lower_bound), \ + "soft_max_up_bound": float(soft_max_up_bound)}) + return out + + def add_position_encoding(input, alpha, beta, name=None): """ **Add Position Encoding Layer** diff --git a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py new file mode 100644 index 0000000000..faa5163b32 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from math import log +from math import exp +from op_test import OpTest +from scipy.special import logit +from scipy.special import expit +import unittest + + +class TestTeacherStudentSigmoidLossOp(OpTest): + """ + Test teacher_student_sigmoid_loss with discrete one-hot labels. 
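
    For reference, the label encoding exercised by this test maps a
    (click, teacher score q) pair onto the single float label column.
    An illustrative helper (the name encode_label is hypothetical, not
    part of the patch):

        def encode_label(clk, q=None):
            # encoding from the operator comment: q (the teacher value z')
            # is assumed to lie in [0, 1) so the kernel's thresholds on the
            # label can separate the four cases
            if q is None:                      # no teacher signal
                return -2.0 if clk == 0 else -1.0
            return float(clk) + q              # [0, 1) for clk=0, [1, 2) for clk=1
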
+ """ + + def setUp(self): + """ + ut + """ + self.op_type = "teacher_student_sigmoid_loss" + batch_size = 16 + num_classes = 1 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, (batch_size, num_classes)) + .astype("float32")), + 'Label': np.random.uniform(0, 2, (batch_size, num_classes)) + .astype("float32") + } + outs = [] + for index, label in enumerate(self.inputs["Label"]): + x = self.inputs["X"][index] + if label < -1.0: + outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x)))) + elif label < 0.0: + outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x)))) + elif label < 1.0: + outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x))) + \ + max(x, 0.0) - x * label + log(1.0 + exp(-abs(x)))) + #print "33 python x:", x, "python label:", label, "term1:", max(x, 0.0) + log(1.0 + exp(-abs(x))), "term2:", max(x, 0.0) - x * label + log(1.0 + exp(-abs(x))) + else: + outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x))) + \ + max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x)))) + #print "44 python x:", x, "python label:", label, "term1:", max(x, 0.0) - x + log(1.0 + exp(-abs(x))), "term2:", max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x))) + self.outputs = {'Y': np.array(outs)} + + def test_check_output(self): + """ + ut + """ + self.check_output() + + def test_check_grad(self): + """ + ut + """ + self.check_grad(["X"], "Y", numeric_grad_delta=0.005) From b3cf476de414a3707a00f906422ddb956db7798f Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 19 Dec 2018 11:00:50 +0800 Subject: [PATCH 02/22] teacher student sigmoid loss test=develop --- python/paddle/fluid/layers/nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 68243cf744..96fa503e77 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9204,7 +9204,7 @@ def teacher_student_sigmoid_loss(input, by the previous operator. label (Variable|list): the ground truth which is a 2-D tensor with shape [N x 1], where N is the batch size. 
- soft_max_up_bound (float): if input > soft_max_up_bound, will be bound + soft_max_up_bound (float): if input > soft_max_up_bound, will be bound soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound Returns: From a94285869b42afa9581bff565ccfdae18c6cc7dd Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 19 Dec 2018 12:48:04 +0800 Subject: [PATCH 03/22] add API test=develop --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc | 2 +- paddle/fluid/operators/teacher_student_sigmoid_loss_op.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 5e9901bb87..460ada8a00 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -74,6 +74,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) +paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)) paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index 98eafb9f84..4b307140c5 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h index 77b2760e9c..f8e64c4d18 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
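
With the operator and its Python API registered (patches 01 through 04), the
computation is worth restating outside the framework. The NumPy sketch below
mirrors the forward and gradient kernels from patch 01; it is a reference
sketch under stated assumptions (float arrays of shape [N, 1], labels encoded
as in the operator comment), not the framework's actual code path.

import numpy as np

def sce(x, z):
    # numerically stable sigmoid cross entropy:
    # max(x, 0) - x * z + log(1 + exp(-|x|))
    return np.maximum(x, 0.0) - x * z + np.log(1.0 + np.exp(-np.abs(x)))

def teacher_student_loss(x, label):
    # the click target z and teacher target z' are recovered from the label
    # encoding; labels below 0 carry no teacher term
    return np.where(label < -1.0, sce(x, 0.0),
           np.where(label < 0.0, sce(x, 1.0),
           np.where(label < 1.0, sce(x, 0.0) + sce(x, label),
                    sce(x, 1.0) + sce(x, label - 1.0))))

def teacher_student_loss_grad(x, label, dout, ub=15.0, lb=-15.0):
    pred = 1.0 / (1.0 + np.exp(-np.clip(x, lb, ub)))
    # d(sce)/dx = sigmoid(x) - z; for labels >= 0 the click and teacher
    # targets sum to `label`, so the combined derivative is 2*pred - label
    g = np.where(label < -1.0, pred,
        np.where(label < 0.0, pred - 1.0, 2.0 * pred - label))
    g = np.where((x >= ub) | (x <= lb), 0.0, g)  # zero gradient once saturated
    return g * dout

Note that soft_max_up_bound and soft_max_lower_bound only affect the backward
pass: the forward kernel works on the raw logits, while the gradient kernel
clips them and returns a zero gradient outside the bounds.
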
From 754a5f8866da4fc3c324211a1ae7cb1746d5648a Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 19 Dec 2018 14:27:25 +0800 Subject: [PATCH 04/22] refine API test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 460ada8a00..33cf1df2a0 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -74,7 +74,6 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) -paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)) paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) @@ -210,6 +209,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)) paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) From 39f4e9273ee07fefcbfab47d92d322dd3d654578 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 19 Dec 2018 19:41:04 +0800 Subject: [PATCH 05/22] data_norm test=develop --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/data_norm_op.cc | 409 +++++++++++++++++++++++++ paddle/fluid/operators/data_norm_op.h | 35 +++ python/paddle/fluid/layers/nn.py | 134 +++++++- 4 files changed, 576 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/data_norm_op.cc create mode 100644 paddle/fluid/operators/data_norm_op.h diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 33cf1df2a0..56b129a928 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -88,6 +88,7 @@ 
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'poo paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) +paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, False, None, None, None, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc new file mode 100644 index 0000000000..d5bc25d19c --- /dev/null +++ b/paddle/fluid/operators/data_norm_op.cc @@ -0,0 +1,409 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/data_norm_op.h" +#include +#include "paddle/fluid/framework/data_layout.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +class DataNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSize"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSum"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Means"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Scales"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); + + const int64_t C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C); + + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("Means", {C}); + ctx->SetOutputDim("Scales", {C}); + ctx->ShareLoD("X", "Y"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = ctx.Input("X")->type(); + // By default, the type of the scale, bias, mean, + // and var tensors should both be float. (For float or float16 input tensor) + // or double (For double input tensor). 
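
For orientation before the kernels that follow: data_norm keeps three
accumulated statistics per channel (a count, a sum, and a squared sum) and
normalizes each channel with a mean and scale derived from them. A minimal
NumPy sketch of the forward pass, assuming a 2-D float input of shape [N, C]
(an illustration, not the Eigen implementation below):

import numpy as np

def data_norm_forward(x, batch_size, batch_sum, batch_square_sum):
    # each statistic is a 1-D array of length C
    means = batch_sum / batch_size
    scales = np.sqrt(batch_size / batch_square_sum)
    return (x - means) * scales, means, scales

With the defaults the Python layer sets further below (batch_size and
batch_square_sum initialized to 1e4, batch_sum to 0.0), means start at 0 and
scales at 1, so the layer initially acts as an identity transform; in the
backward pass d_x is simply d_y * scales.
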
+ auto dn_param_type = framework::proto::VarType::FP32; + if (input_data_type == framework::proto::VarType::FP64) { + dn_param_type = framework::proto::VarType::FP64; + } + PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input("BatchSize")->type(), + "BatchSize input should be of float type"); + PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input("BatchSum")->type(), + "BatchSum input should be of float type"); + PADDLE_ENFORCE_EQ(dn_param_type, + ctx.Input("BatchSquareSum")->type(), + "BatchSquareSum input should be of float type"); + + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); + } +}; + +class DataNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + // AddAttr("is_test", "").SetDefault(false); + AddAttr("epsilon", "") + .SetDefault(1e-4) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("data_layout", "").SetDefault("NCHW"); + AddInput("X", "The input tensor"); + AddInput("BatchSize", + "BatchSize is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("BatchSum", + "BatchSum is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("BatchSquareSum", + "The global BatchSquareSum (for training) or " + "estimated BatchSquareSum (for testing)"); + AddOutput("Y", "result after normalization"); + AddOutput("Means", + "Mean of the history data batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("Scales", + "Scales of the history data batch, " + "will apply to output when training") + .AsIntermediate(); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddComment(R"DOC( +Data Normalization. + +Can be used as a normalizer function for data +The required data format for this layer is one of the following: +1. NHWC `[batch, in_height, in_width, in_channels]` +2. NCHW `[batch, in_channels, in_height, in_width]` + +)DOC"); + } +}; + +template +class DataNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + // const bool is_test = ctx.Attr("is_test"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("Means"); + auto *scales = ctx.Output("Scales"); + + // alloc memory + y->mutable_data(ctx.GetPlace()); + + Eigen::Array inv_std(C); + ConstEigenVectorArrayMap b_size_arr( + ctx.Input("BatchSize")->data(), C); + ConstEigenVectorArrayMap b_sum_arr( + ctx.Input("BatchSum")->data(), C); + ConstEigenVectorArrayMap b_square_sum_arr( + ctx.Input("BatchSquareSum")->data(), C); + EigenVectorArrayMap means_arr(mean_out->mutable_data(ctx.GetPlace()), + C); + EigenVectorArrayMap scales_arr(scales->mutable_data(ctx.GetPlace()), + C); + means_arr = b_sum_arr / b_size_arr; + scales_arr = (b_size_arr / b_square_sum_arr).sqrt(); + + switch (data_layout) { + case DataLayout::kNCHW: // because it's two dimensions, so make no + // difference + case DataLayout::kNHWC: { + EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, N) = + (ConstEigenArrayMap(x->data(), C, N).colwise() - means_arr) + .colwise() * + scales_arr; + break; + } + default: + PADDLE_THROW("Unknown storage order: %d", data_layout); + } + } +}; + +class DataNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSize"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSum"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), ""); + PADDLE_ENFORCE(ctx->HasInput("Means"), ""); + PADDLE_ENFORCE(ctx->HasInput("Scales"), ""); + + // check output + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSize")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSum")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSquareSum")), + ""); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->SetOutputDim(framework::GradVarName("BatchSize"), {C}); + ctx->SetOutputDim(framework::GradVarName("BatchSum"), {C}); + ctx->SetOutputDim(framework::GradVarName("BatchSquareSum"), {C}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), layout, library); + } +}; + +template +class DataNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *batch_size = ctx.Input("BatchSize"); + const auto *batch_sum = ctx.Input("BatchSum"); + const auto *batch_square_sum = ctx.Input("BatchSquareSum"); + const auto *scales = ctx.Input("Scales"); + const auto *means = ctx.Input("Means"); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_batch_size = + ctx.Output(framework::GradVarName("BatchSize")); + auto *d_batch_sum = ctx.Output(framework::GradVarName("BatchSum")); + auto *d_batch_square_sum = + ctx.Output(framework::GradVarName("BatchSquareSum")); + + EigenVectorArrayMap d_batch_size_arr( + d_batch_size->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap d_batch_sum_arr( + d_batch_sum->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap d_batch_square_sum_arr( + d_batch_square_sum->mutable_data(ctx.GetPlace()), C); + + d_batch_size_arr.setZero(); + d_batch_sum_arr.setZero(); + d_batch_square_sum_arr.setZero(); + + const float epsilon = ctx.Attr("epsilon"); + switch ( + data_layout) { // because it's two dimensions, so make no difference + case DataLayout::kNCHW: + case DataLayout::kNHWC: { + ConstEigenVectorArrayMap scales_arr(scales->data(), C); + ConstEigenVectorArrayMap means_arr(means->data(), C); + ConstEigenArrayMap x_arr(x->data(), C, N); + ConstEigenArrayMap d_y_arr(d_y->data(), C, N); + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, N); + d_x_arr.setZero(); + for (int nc = 0; nc < N; ++nc) { + d_x_arr.col(nc) = d_y_arr.col(nc) * scales_arr; + } + + // calculate data sum and squre sum + ConstEigenVectorArrayMap batch_size_arr(batch_size->data(), C); + ConstEigenVectorArrayMap batch_sum_arr(batch_sum->data(), C); + ConstEigenVectorArrayMap batch_square_sum_arr( + batch_square_sum->data(), C); + Eigen::Array sample_sum(C); + Eigen::Array sample_square_sum(C); + // calculate data sample sum and square sum + sample_sum.setZero(); + sample_square_sum.setZero(); + for (int nc = 0; nc < N; ++nc) { + sample_sum += x_arr.col(nc); + sample_square_sum += (x_arr.col(nc) - means_arr).square(); + } + // calculate gradient + d_batch_size_arr.setConstant(N); + d_batch_sum_arr = sample_sum; + d_batch_square_sum_arr = sample_square_sum + d_batch_size_arr * epsilon; + break; + } + default: + PADDLE_THROW("Unknown storage order: %s", data_layout_str); + } + } +}; + +class DataNormGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("data_norm_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + + op->SetInput("BatchSize", Input("BatchSize")); + op->SetInput("BatchSum", Input("BatchSum")); + op->SetInput("BatchSquareSum", Input("BatchSquareSum")); + op->SetInput("Scales", Output("Scales")); + op->SetInput("Means", Output("Means")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("BatchSize"), InputGrad("BatchSize")); + op->SetOutput(framework::GradVarName("BatchSum"), InputGrad("BatchSum")); + op->SetOutput(framework::GradVarName("BatchSquareSum"), + InputGrad("BatchSquareSum")); + + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(data_norm, ops::DataNormOp, ops::DataNormOpMaker, + ops::DataNormGradMaker); +REGISTER_OPERATOR(data_norm_grad, ops::DataNormGradOp); + +REGISTER_OP_CPU_KERNEL( + data_norm, ops::DataNormKernel, + ops::DataNormKernel); +REGISTER_OP_CPU_KERNEL( + data_norm_grad, + ops::DataNormGradKernel, + ops::DataNormGradKernel); diff --git 
a/paddle/fluid/operators/data_norm_op.h b/paddle/fluid/operators/data_norm_op.h new file mode 100644 index 0000000000..63451214bc --- /dev/null +++ b/paddle/fluid/operators/data_norm_op.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class DataNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +template +class DataNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 341f36763f..269d0086ef 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -55,6 +55,7 @@ __all__ = [ 'adaptive_pool2d', 'adaptive_pool3d', 'batch_norm', + 'data_norm', 'beam_search_decode', 'conv2d_transpose', 'conv3d_transpose', @@ -2886,6 +2887,133 @@ def batch_norm(input, return helper.append_activation(batch_norm_out) +def data_norm(input, + act=None, + epsilon=1e-05, + param_attr=None, + data_layout='NCHW', + in_place=False, + use_mkldnn=False, + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=False): + """ + **Data Normalization Layer** + + Can be used as a normalizer function for conv2d and fully_connected operations. + The required data format for this layer is one of the following: + + 1. NHWC `[batch, in_height, in_width, in_channels]` + + 2. NCHW `[batch, in_channels, in_height, in_width]` + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Args: + input(variable): The input variable which is a LoDTensor. + act(string, Default None): Activation type, linear|relu|prelu|... + epsilon(float, Default 1e-05): + param_attr(ParamAttr): The parameter attribute for Parameter `scale`. + data_layout(string, default NCHW): NCHW|NHWC + in_place(bool, Default False): Make the input and output of batch norm reuse memory. + use_mkldnn(bool, Default false): ${use_mkldnn_comment} + name(string, Default None): A name for this layer(optional). If set None, the layer + will be named automatically. + moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. 
+ moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance. + do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not. + + Returns: + Variable: A tensor variable which is the result after applying data normalization on the input. + + Examples: + + .. code-block:: python + + data = fluid.layers.data(input=x, size=200, param_attr='fc1.w') + hidden2 = fluid.layers.data_norm(input=hidden1) + """ + helper = LayerHelper('data_norm', **locals()) + dtype = helper.input_dtype() + + input_shape = input.shape + if data_layout == 'NCHW': + channel_num = input_shape[1] + else: + if data_layout == 'NHWC': + channel_num = input_shape[-1] + else: + raise ValueError("unsupported data layout:" + data_layout) + + param_shape = [channel_num] + + batch_size_default = 1e4 + batch_sum_default = 0.0 + batch_square_sum_default = 1e4 + + if param_attr and isinstance(param_attr, dict): + batch_size_default = param_attr.get("batch_size", 1e4) + batch_sum_default = param_attr.get("batch_sum", 0.0) + batch_square_sum_default = param_attr.get("batch_square", 1e4) + + # create parameter + batch_size = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_size', + initializer=Constant(value=float(batch_size_default)), + trainable=True), + shape=param_shape, + dtype=input.dtype) + + batch_sum = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_sum', + initializer=Constant(value=float(batch_sum_default)), + trainable=True), + shape=param_shape, + dtype=input.dtype) + + batch_square_sum = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_square_sum', + initializer=Constant(value=float(batch_square_sum_default)), + trainable=True), + shape=param_shape, + dtype=input.dtype) + + means = helper.create_variable(dtype=dtype, stop_gradient=True) + scales = helper.create_variable(dtype=dtype, stop_gradient=True) + + data_norm_out = input if in_place else helper.create_variable(dtype=dtype) + + helper.append_op( + type="data_norm", + inputs={ + "X": input, + "BatchSize": batch_size, + "BatchSum": batch_sum, + "BatchSquareSum": batch_square_sum + }, + outputs={"Y": data_norm_out, + "Means": means, + "Scales": scales}, + attrs={"epsilon": epsilon, + "use_mkldnn": use_mkldnn}) + + return helper.append_activation(data_norm_out) + + @templatedoc() def layer_norm(input, scale=True, @@ -3054,9 +3182,9 @@ def group_norm(input, inputs['Bias'] = bias # create output - mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) - variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) - group_norm_out = helper.create_tmp_variable(dtype) + mean_out = helper.create_variable(dtype=dtype, stop_gradient=True) + variance_out = helper.create_variable(dtype=dtype, stop_gradient=True) + group_norm_out = helper.create_variable(dtype) helper.append_op( type="group_norm", From f18e8a7a5e9d9964f4fe1c60f53d32368b126d28 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 20 Dec 2018 10:38:09 +0800 Subject: [PATCH 06/22] remove some comments & refine doc & put template class in .h test=develop --- .../teacher_student_sigmoid_loss_op.cc | 110 ++---------------- .../teacher_student_sigmoid_loss_op.h | 93 +++++++++++++++ .../test_teacher_student_sigmoid_loss_op.py | 11 -- 3 files changed, 101 insertions(+), 113 deletions(-) diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index 4b307140c5..c8ee13875c 100644 --- 
a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -115,18 +115,22 @@ class TeacherStudentSigmoidLossOpMaker AddOutput("Y", "(Tensor, default Tensor), a 2-D tensor with shape " "[N x 1]. The teacher student sigmoid loss."); - AddAttr("soft_max_up_bound", "fp32, default 15.0").SetDefault(15.0); - AddAttr("soft_max_lower_bound", "fp32, default -15.0") + AddAttr( + "soft_max_up_bound", + "fp32, if input > soft_max_up_bound, will be bound, default 15.0") + .SetDefault(15.0); + AddAttr( + "soft_max_lower_bound", + "fp32, if input < soft_max_lower_bound, will be bound, default -15.0") .SetDefault(-15.0); AddComment(R"DOC( TeacherStudentSigmoidLoss Operator. -TeacherStudentSigmoidLoss Operator. It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is that we add another label(z') to original. loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) z is click or not - z' is value q of feed_fine + z' is teacher value label = {-2, -1, [0, 2]} when z' is not exist, clk = 0 : label = -2; when z' is not exist, clk = 1 : label = -1; @@ -137,104 +141,6 @@ we add another label(z') to original. } }; -// template -template -class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()), - "This kernel only runs on CPU."); - - Tensor* y = context.Output("Y"); - const Tensor* x = context.Input("X"); - const Tensor* labels = context.Input("Label"); - T* y_data = y->mutable_data(context.GetPlace()); - const T* x_data = x->data(); - const T* label_data = labels->data(); - int64_t batch_size = x->dims()[0]; - // loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + - // log(1 + exp(-abs(x))) - // z is click or not - // z' is value q of feed_fine - // label = {-2, -1, [0, 2]} - // when z' is not exist, clk = 0 : label = -2; - // when z' is not exist, clk = 1 : label = -1; - // when z' is exist , clk = 0 : label = 0 + z'; - // when z' is exist , clk = 1 : label = 1 + z'; - for (int i = 0; i < batch_size; ++i) { - if (label_data[i] < -1.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + - log(1.0 + exp(-fabs(x_data[i]))); - } else if (label_data[i] < 0.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + - log(1.0 + exp(-fabs(x_data[i]))); - } else if (label_data[i] < 1.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + - log(1.0 + exp(-fabs(x_data[i]))) + - (x_data[i] > 0 ? x_data[i] : 0.0) - - x_data[i] * label_data[i] + - log(1.0 + exp(-fabs(x_data[i]))); - } else { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + - log(1.0 + exp(-fabs(x_data[i]))) + - (x_data[i] > 0 ? 
x_data[i] : 0.0) - - x_data[i] * (label_data[i] - 1.0) + - log(1.0 + exp(-fabs(x_data[i]))); - } - } - } -}; - -template -class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - const T* x_data = x->data(); - - Tensor* dx = context.Output(framework::GradVarName("X")); - T* dx_data = dx->mutable_data(context.GetPlace()); - - const Tensor* labels = context.Input("Label"); - const T* label_data = labels->data(); - - T soft_max_up_bound = - static_cast(context.Attr("soft_max_up_bound")); - T soft_max_lower_bound = - static_cast(context.Attr("soft_max_lower_bound")); - - int64_t batch_size = x->dims()[0]; - - const framework::Tensor* dOut = - context.Input(framework::GradVarName("Y")); - - const T* dout_data = dOut->data(); - - for (int i = 0; i < batch_size; ++i) { - T sum_val = x_data[i]; - if (sum_val > soft_max_up_bound) { - sum_val = soft_max_up_bound; - } else { - if (sum_val < soft_max_lower_bound) { - sum_val = soft_max_lower_bound; - } - } - - T pred = 1.0 / (1.0 + exp(-sum_val)); - if (label_data[i] < -1.0) { - dx_data[i] = 0.0 - pred; - } else if (label_data[i] < 0.0) { - dx_data[i] = 1.0 - pred; - } else { - dx_data[i] = label_data[i] - 2.0 * pred; - } - if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) { - dx_data[i] = 0; - } - dx_data[i] *= dout_data[i] * -1; - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h index f8e64c4d18..41d2662ae2 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h @@ -20,6 +20,99 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + Tensor* y = context.Output("Y"); + const Tensor* x = context.Input("X"); + const Tensor* labels = context.Input("Label"); + T* y_data = y->mutable_data(context.GetPlace()); + const T* x_data = x->data(); + const T* label_data = labels->data(); + int64_t batch_size = x->dims()[0]; + // loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + + // log(1 + exp(-abs(x))) + // z is click or not + // z' is value q of feed_fine + // label = {-2, -1, [0, 2]} + // when z' is not exist, clk = 0 : label = -2; + // when z' is not exist, clk = 1 : label = -1; + // when z' is exist , clk = 0 : label = 0 + z'; + // when z' is exist , clk = 1 : label = 1 + z'; + for (int i = 0; i < batch_size; ++i) { + if (label_data[i] < -1.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + + log(1.0 + exp(-fabs(x_data[i]))); + } else if (label_data[i] < 0.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + + log(1.0 + exp(-fabs(x_data[i]))); + } else if (label_data[i] < 1.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + + log(1.0 + exp(-fabs(x_data[i]))) + + (x_data[i] > 0 ? x_data[i] : 0.0) - + x_data[i] * label_data[i] + + log(1.0 + exp(-fabs(x_data[i]))); + } else { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + + log(1.0 + exp(-fabs(x_data[i]))) + + (x_data[i] > 0 ? 
x_data[i] : 0.0) - + x_data[i] * (label_data[i] - 1.0) + + log(1.0 + exp(-fabs(x_data[i]))); + } + } + } +}; +template +class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + const T* x_data = x->data(); + + Tensor* dx = context.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(context.GetPlace()); + + const Tensor* labels = context.Input("Label"); + const T* label_data = labels->data(); + + T soft_max_up_bound = + static_cast(context.Attr("soft_max_up_bound")); + T soft_max_lower_bound = + static_cast(context.Attr("soft_max_lower_bound")); + + int64_t batch_size = x->dims()[0]; + + const framework::Tensor* dOut = + context.Input(framework::GradVarName("Y")); + + const T* dout_data = dOut->data(); + + for (int i = 0; i < batch_size; ++i) { + T sum_val = x_data[i]; + if (sum_val > soft_max_up_bound) { + sum_val = soft_max_up_bound; + } else { + if (sum_val < soft_max_lower_bound) { + sum_val = soft_max_lower_bound; + } + } + + T pred = 1.0 / (1.0 + exp(-sum_val)); + if (label_data[i] < -1.0) { + dx_data[i] = 0.0 - pred; + } else if (label_data[i] < 0.0) { + dx_data[i] = 1.0 - pred; + } else { + dx_data[i] = label_data[i] - 2.0 * pred; + } + if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) { + dx_data[i] = 0; + } + dx_data[i] *= dout_data[i] * -1; + } + } +}; } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py index faa5163b32..26bf0fd883 100644 --- a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py @@ -27,9 +27,6 @@ class TestTeacherStudentSigmoidLossOp(OpTest): """ def setUp(self): - """ - ut - """ self.op_type = "teacher_student_sigmoid_loss" batch_size = 16 num_classes = 1 @@ -50,21 +47,13 @@ class TestTeacherStudentSigmoidLossOp(OpTest): elif label < 1.0: outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x))) + \ max(x, 0.0) - x * label + log(1.0 + exp(-abs(x)))) - #print "33 python x:", x, "python label:", label, "term1:", max(x, 0.0) + log(1.0 + exp(-abs(x))), "term2:", max(x, 0.0) - x * label + log(1.0 + exp(-abs(x))) else: outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x))) + \ max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x)))) - #print "44 python x:", x, "python label:", label, "term1:", max(x, 0.0) - x + log(1.0 + exp(-abs(x))), "term2:", max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x))) self.outputs = {'Y': np.array(outs)} def test_check_output(self): - """ - ut - """ self.check_output() def test_check_grad(self): - """ - ut - """ self.check_grad(["X"], "Y", numeric_grad_delta=0.005) From 9ae50dd07d4c912e9cb78dfc5703cba808684343 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 3 Jan 2019 16:22:05 +0800 Subject: [PATCH 07/22] fix gpu buils issue on windows test=develop --- cmake/generic.cmake | 3 +++ paddle/fluid/platform/cuda_helper_test.cu | 1 + 2 files changed, 4 insertions(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c6fe2e970d..9419ffbad0 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -447,6 +447,9 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) + if(WIN32) + list(APPEND nv_test_DEPS shlwapi) + endif(WIN32) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) common_link(${TARGET_NAME}) diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index 466bf90c63..5b605fdc96 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -15,6 +15,7 @@ #include #include #include +#include #include #define PADDLE_CUDA_FP16 From 25523bb8e67496e8f761f6da1351c034e1980759 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 3 Jan 2019 16:46:56 +0800 Subject: [PATCH 08/22] test=develop --- cmake/generic.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9419ffbad0..49e8dcb70a 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -447,10 +447,10 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) if(WIN32) - list(APPEND nv_test_DEPS shlwapi) + target_link_libraries(${TARGET_NAME} shlwapi) endif(WIN32) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) From fd4f4d0e5f5eb7eccfaf2a1234e66d612464a94b Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 3 Jan 2019 19:52:24 +0800 Subject: [PATCH 09/22] fix build issue test=develop --- paddle/fluid/platform/cuda_helper_test.cu | 2 +- paddle/fluid/platform/float16_test.cu | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index 5b605fdc96..ff49b92ff5 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #define PADDLE_CUDA_FP16 diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index b1b51d804e..14cad927f0 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -271,11 +271,13 @@ TEST(float16, isinf) { float16 b = float16(INFINITY); // underflow to 0 float16 native_a(5e-40f); - // overflow to inf - float16 native_b(5e40f); EXPECT_EQ(std::isinf(a), true); EXPECT_EQ(std::isinf(b), true); +#ifndef _WIN32 + // overflow to inf + float16 native_b(5e40f); EXPECT_EQ(std::isinf(native_b), true); +#endif EXPECT_EQ(native_a, float16(0)); } From 0d0bc61248147d1f382a76e19520750ce4e63eb0 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 7 Jan 2019 18:42:42 +0800 Subject: [PATCH 10/22] update api test=develop --- paddle/fluid/imperative/layer.h | 5 +++- paddle/fluid/pybind/imperative.h | 8 ++++--- paddle/fluid/pybind/pybind.cc | 7 ++++-- python/paddle/fluid/imperative/layers.py | 23 ++++++++++++++++--- python/paddle/fluid/imperative/nn.py | 6 ++--- .../fluid/tests/unittests/test_imperative.py | 6 ++--- .../unittests/test_imperative_optimizer.py | 4 ++-- 7 
files changed, 42 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 2abda933cf..377ac3e1c5 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -153,7 +153,10 @@ class Layer { return vars; } - virtual void Backward() { LOG(ERROR) << "To support customize"; } + virtual std::vector Backward(const std::vector& inputs) { + std::vector vars; + return vars; + } }; } // namespace imperative diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index 7a9d3a01ea..ef0d643954 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace pybind { -class PyLayer : public imperative::Layer { +class Layer : public imperative::Layer { public: using imperative::Layer::Layer; // Inherit constructors @@ -32,8 +32,10 @@ class PyLayer : public imperative::Layer { inputs); // NOLINT } - void Backward() override { - PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT + std::vector Backward( + const std::vector& inputs) override { + PYBIND11_OVERLOAD(std::vector, Layer, Backward, + inputs); // NOLINT } }; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3b81d59ad9..6e3c52da89 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -170,14 +170,17 @@ PYBIND11_MODULE(core, m) { }, py::return_value_policy::reference); - py::class_ layer(m, "Layer"); + py::class_ layer(m, "Layer"); layer.def(py::init<>()) .def("forward", [](imperative::Layer &self, const std::vector &inputs) { return self.Forward(inputs); }) - .def("backward", &imperative::Layer::Backward); + .def("backward", [](imperative::Layer &self, + const std::vector &inputs) { + return self.Backward(inputs); + }); BindTracer(&m); py::class_(m, "Tensor", py::buffer_protocol()) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index d78d61eb3f..1ebf79e052 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -20,10 +20,12 @@ from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.imperative import base -__all__ = ['PyLayer'] +__all__ = ['Layer'] -class PyLayer(core.Layer): +class Layer(core.Layer): + """Layers composed of operators.""" + def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None): self._once_built = False self._dtype = dtype @@ -37,8 +39,23 @@ class PyLayer(core.Layer): self._once_built = True outputs = self.forward(*inputs) - return outputs def forward(self, *inputs): raise NotImplementedError + + def backward(self, *inputs): + raise ValueError("Layer shouldn't implement backward") + + +class PyLayer(core.Layer): + """Layers composed of user-defined python codes.""" + + def __call__(self, *inputs): + pass + + def forward(self, *inputs): + raise NotImplementedError + + def backward(self, *inputs): + raise NotImplementedError diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 4f30417e99..8754e5d4d0 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -30,7 +30,7 @@ __all__ = [ ] -class Conv2D(layers.PyLayer): +class Conv2D(layers.Layer): def __init__(self, num_channels, num_filters, @@ -143,7 +143,7 @@ class Conv2D(layers.PyLayer): return self._helper.append_activation(pre_act) -class Pool2D(layers.PyLayer): +class Pool2D(layers.Layer): def 
__init__(self, pool_size=-1, pool_type="max", @@ -205,7 +205,7 @@ class Pool2D(layers.PyLayer): return pool_out -class FC(layers.PyLayer): +class FC(layers.Layer): def __init__(self, size, param_attr=None, diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 1dc13ec74e..44005411d1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -22,7 +22,7 @@ from paddle.fluid.imperative.nn import FC from test_imperative_base import new_program_scope -class MyLayer(fluid.imperative.PyLayer): +class MyLayer(fluid.imperative.Layer): def __init__(self): super(MyLayer, self).__init__() @@ -34,7 +34,7 @@ class MyLayer(fluid.imperative.PyLayer): return [x] -class MLP(fluid.imperative.PyLayer): +class MLP(fluid.imperative.Layer): def __init__(self): super(MLP, self).__init__() self._fc1 = FC(3, @@ -56,7 +56,7 @@ class TestImperative(unittest.TestCase): with fluid.imperative.guard(): cl = core.Layer() cl.forward([]) - l = fluid.imperative.PyLayer() + l = fluid.imperative.Layer() self.assertRaises(NotImplementedError, l.forward, []) def test_layer_in_out(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 5d97edf876..0549f50fe2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -26,7 +26,7 @@ from paddle.fluid.imperative.base import to_variable from test_imperative_base import new_program_scope -class SimpleImgConvPool(fluid.imperative.PyLayer): +class SimpleImgConvPool(fluid.imperative.Layer): def __init__(self, num_channels, num_filters, @@ -72,7 +72,7 @@ class SimpleImgConvPool(fluid.imperative.PyLayer): return x -class MNIST(fluid.imperative.PyLayer): +class MNIST(fluid.imperative.Layer): def __init__(self, param_attr=None, bias_attr=None): super(MNIST, self).__init__() From b629133375728fcb0be71c2d6fcf7c9d83a2b391 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 8 Jan 2019 10:52:03 +0800 Subject: [PATCH 11/22] checkpoint runnable PyLayer test=develop --- paddle/fluid/imperative/layer.h | 49 +++++++++++++++++-- paddle/fluid/pybind/imperative.h | 6 --- paddle/fluid/pybind/pybind.cc | 21 +++++--- python/paddle/fluid/imperative/layers.py | 22 ++++++--- .../fluid/tests/unittests/test_imperative.py | 26 ++++++++++ 5 files changed, 101 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 377ac3e1c5..d76512af04 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -17,6 +17,9 @@ #include #include #include +#include "pybind11/pybind11.h" + +#include "Python.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" @@ -25,6 +28,8 @@ namespace paddle { namespace imperative { +namespace py = ::pybind11; + class PreparedOp { public: PreparedOp(const framework::OperatorBase& op, @@ -152,10 +157,48 @@ class Layer { std::vector vars; return vars; } +}; - virtual std::vector Backward(const std::vector& inputs) { - std::vector vars; - return vars; +static void CallPythonFunc(py::object* callable, + const std::vector& ins, + std::vector* outs) { + py::gil_scoped_acquire guard; + py::tuple in_args(ins.size()); + for (size_t i = 0; i < ins.size(); ++i) { + in_args[i] = ins[i].IsInitialized() ? 
py::cast(ins[i]) : py::cast(nullptr); + } + + auto ret = (*callable)(in_args); + auto ret_tuple = py::cast(ret); + size_t ret_num = py::len(ret_tuple); + for (size_t i = 0; i < ret_num; ++i) { + try { + auto* py_out_tensor = py::cast(ret_tuple[i]); + PADDLE_ENFORCE_NOT_NULL(py_out_tensor, + "Output tensor %d should not be nullptr", i); + outs->push_back(py_out_tensor); + } catch (py::cast_error&) { + PADDLE_THROW("The %d-th output must be LoDTensor", i); + } + } +} + +class PyLayer { + public: + virtual ~PyLayer() {} + + static std::vector Apply(py::object* callable, + const std::vector& inputs) { + std::vector outputs; + std::vector tensor_inputs; + std::vector tensor_outputs; + + for (const VarBase& in : inputs) { + tensor_inputs.push_back(in.var_->Get()); + } + + CallPythonFunc(callable, tensor_inputs, &tensor_outputs); + return outputs; } }; diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index ef0d643954..f947b743f9 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -31,12 +31,6 @@ class Layer : public imperative::Layer { PYBIND11_OVERLOAD(std::vector, Layer, Forward, inputs); // NOLINT } - - std::vector Backward( - const std::vector& inputs) override { - PYBIND11_OVERLOAD(std::vector, Layer, Backward, - inputs); // NOLINT - } }; class PyOpBase : public imperative::OpBase { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6e3c52da89..d065818bc8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -172,15 +172,20 @@ PYBIND11_MODULE(core, m) { py::class_ layer(m, "Layer"); layer.def(py::init<>()) - .def("forward", - [](imperative::Layer &self, - const std::vector &inputs) { - return self.Forward(inputs); - }) - .def("backward", [](imperative::Layer &self, - const std::vector &inputs) { - return self.Backward(inputs); + .def("forward", [](imperative::Layer &self, + const std::vector &inputs) { + return self.Forward(inputs); }); + + py::class_(m, "PyLayer") + .def(py::init<>()) + .def_static("apply", + [](py::object *callable, + const std::vector &inputs) + -> std::vector { + return imperative::PyLayer::Apply(callable, inputs); + }); + BindTracer(&m); py::class_(m, "Tensor", py::buffer_protocol()) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 1ebf79e052..7696427366 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -20,7 +20,7 @@ from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.imperative import base -__all__ = ['Layer'] +__all__ = ['Layer', 'PyLayer'] class Layer(core.Layer): @@ -48,14 +48,24 @@ class Layer(core.Layer): raise ValueError("Layer shouldn't implement backward") -class PyLayer(core.Layer): +# TODO(panyx0718): Inherit from C++ base class. 
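+# A PyLayer wraps user-supplied, numpy-level forward/backward functions instead of traced operators.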
+class PyLayer(core.PyLayer): """Layers composed of user-defined python codes.""" - def __call__(self, *inputs): - pass + def __init__(self): + super(PyLayer, self).__init__() - def forward(self, *inputs): + @staticmethod + def forward(inputs): raise NotImplementedError - def backward(self, *inputs): + @staticmethod + def backward(inputs): raise NotImplementedError + + @classmethod + def __call__(cls, inputs): + inputs = map(base.to_variable, inputs) + inputs = [x._ivar for x in inputs] + sys.stderr.write('%s\n' % inputs) + return core.PyLayer.apply(cls.forward, inputs) diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 44005411d1..ae99fb82e3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -15,6 +15,7 @@ import contextlib import unittest import numpy as np +import sys import paddle.fluid as fluid from paddle.fluid import core @@ -34,6 +35,24 @@ class MyLayer(fluid.imperative.Layer): return [x] +class MyPyLayer(fluid.imperative.PyLayer): + def __init__(self): + super(MyPyLayer, self).__init__() + + @staticmethod + def forward(inputs): + sys.stderr.write('before forward\n') + ret = np.tanh(inputs[0]) + sys.stderr.write('after forward: %s\n' % ret) + tensor = core.LoDTensor() + tensor.set(ret, core.CPUPlace()) + return tuple([tensor]) + + @staticmethod + def backward(douts, outs): + return np.array(douts[0]) * (1 - np.square(np.array(outs[0]))) + + class MLP(fluid.imperative.Layer): def __init__(self): super(MLP, self).__init__() @@ -59,6 +78,13 @@ class TestImperative(unittest.TestCase): l = fluid.imperative.Layer() self.assertRaises(NotImplementedError, l.forward, []) + def test_pylayer(self): + with fluid.imperative.guard(): + my_py_layer = MyPyLayer() + out = my_py_layer([np.ones([2, 2], np.float32)]) + sys.stderr.write('%s\n' % np.array(out)) + # out.backward() + def test_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): From d54133ea85b586b7ddf337e9af08145c7eba5949 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 8 Jan 2019 11:35:39 +0800 Subject: [PATCH 12/22] not include the numeric under linux test=develop --- paddle/fluid/platform/cuda_helper_test.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index ff49b92ff5..9e3025bf30 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -15,7 +15,9 @@ #include #include #include +#ifdef _WIN32 #include +#endif #include #define PADDLE_CUDA_FP16 From 11d4d39cd7ce5dd33df94ed65854eb357cad2400 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 8 Jan 2019 13:16:45 +0800 Subject: [PATCH 13/22] forward working test=develop --- paddle/fluid/imperative/layer.h | 21 +++++++++++-------- paddle/fluid/pybind/pybind.cc | 5 +++-- python/paddle/fluid/framework.py | 5 ++++- python/paddle/fluid/imperative/layers.py | 16 ++++++++++++-- .../fluid/tests/unittests/test_imperative.py | 4 ++-- 5 files changed, 35 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index d76512af04..a0eee357c3 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -161,13 +161,14 @@ class Layer { static void CallPythonFunc(py::object* callable, const std::vector& ins, - std::vector* outs) { + std::vector* outs) { py::gil_scoped_acquire 
guard; py::tuple in_args(ins.size()); for (size_t i = 0; i < ins.size(); ++i) { in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr); } + // TODO(panyx0718): Who owns the returned LoDTensor. auto ret = (*callable)(in_args); auto ret_tuple = py::cast(ret); size_t ret_num = py::len(ret_tuple); @@ -176,7 +177,11 @@ static void CallPythonFunc(py::object* callable, auto* py_out_tensor = py::cast(ret_tuple[i]); PADDLE_ENFORCE_NOT_NULL(py_out_tensor, "Output tensor %d should not be nullptr", i); - outs->push_back(py_out_tensor); + VarBase* var = new VarBase(); + auto* tensor = var->var_->GetMutable(); + tensor->ShareDataWith(*py_out_tensor); + tensor->set_lod(py_out_tensor->lod()); + outs->push_back(var); } catch (py::cast_error&) { PADDLE_THROW("The %d-th output must be LoDTensor", i); } @@ -187,18 +192,16 @@ class PyLayer { public: virtual ~PyLayer() {} - static std::vector Apply(py::object* callable, - const std::vector& inputs) { - std::vector outputs; + static std::vector Apply(py::object* callable, + const std::vector& inputs) { std::vector tensor_inputs; - std::vector tensor_outputs; + std::vector ret; for (const VarBase& in : inputs) { tensor_inputs.push_back(in.var_->Get()); } - - CallPythonFunc(callable, tensor_inputs, &tensor_outputs); - return outputs; + CallPythonFunc(callable, tensor_inputs, &ret); + return ret; } }; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d065818bc8..684b931ee8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -182,9 +182,10 @@ PYBIND11_MODULE(core, m) { .def_static("apply", [](py::object *callable, const std::vector &inputs) - -> std::vector { + -> std::vector { return imperative::PyLayer::Apply(callable, inputs); - }); + }, + py::return_value_policy::take_ownership); BindTracer(&m); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 70767c962f..8e18dffac3 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -372,7 +372,10 @@ class Variable(object): self.stop_gradient = stop_gradient self.is_data = is_data if _in_imperative_mode(): - self._ivar = core.VarBase() + if 'ivar' in kwargs: + self._ivar = kwargs['ivar'] + else: + self._ivar = core.VarBase() self._ivar.desc = self.desc self._ivar.stop_gradient = stop_gradient diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 7696427366..06b6d7ac06 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -67,5 +67,17 @@ class PyLayer(core.PyLayer): def __call__(cls, inputs): inputs = map(base.to_variable, inputs) inputs = [x._ivar for x in inputs] - sys.stderr.write('%s\n' % inputs) - return core.PyLayer.apply(cls.forward, inputs) + ivars = core.PyLayer.apply(cls.forward, inputs) + ret = [] + for ivar in ivars: + tensor = ivar.value.get_tensor() + block = framework.default_main_program().current_block() + py_var = framework.Variable( + block, + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=tensor.shape(), + dtype=tensor._dtype(), + ivar=ivar) + ret.append(py_var) + return ret diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index ae99fb82e3..133e1e65c7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -81,8 +81,8 @@ class TestImperative(unittest.TestCase): def test_pylayer(self): with 
fluid.imperative.guard(): my_py_layer = MyPyLayer() - out = my_py_layer([np.ones([2, 2], np.float32)]) - sys.stderr.write('%s\n' % np.array(out)) + outs = my_py_layer([np.ones([2, 2], np.float32)]) + sys.stderr.write('%s\n' % outs[0]._numpy()) # out.backward() def test_layer_in_out(self): From 2349acea48f76cc527d9a47ff6a0b5da5e77aa2f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 8 Jan 2019 16:41:20 +0800 Subject: [PATCH 14/22] checkpoint test=develop --- paddle/fluid/imperative/layer.cc | 19 +++++++++++++++++ paddle/fluid/imperative/layer.h | 27 ++++++++++++------------ paddle/fluid/imperative/tracer.h | 15 +++++++++++++ paddle/fluid/pybind/imperative.cc | 4 +++- paddle/fluid/pybind/pybind.cc | 24 +++++++++++++++------ python/paddle/fluid/imperative/layers.py | 13 +++++++++--- 6 files changed, 78 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 9813149865..53e949d9f9 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -27,6 +27,8 @@ namespace paddle { namespace imperative { +std::map py_funcs_; + using framework::Variable; void AddTo(Variable* src, Variable* dst) { @@ -183,5 +185,22 @@ void VarBase::RunBackward() { Autograd().RunBackward(this); } +void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { + py_funcs_[func_id] = py_func; +} + +std::vector PyLayer::Apply(int func_id, + const std::vector& inputs) { + std::vector tensor_inputs; + std::vector ret; + + for (const VarBase& in : inputs) { + tensor_inputs.push_back(in.var_->Get()); + } + PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); + CallPythonFunc(py_funcs_[func_id], tensor_inputs, &ret); + return ret; +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index a0eee357c3..52cbb2c015 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -82,6 +82,7 @@ class PreparedOp { framework::OperatorWithKernel::OpKernelFunc func; platform::DeviceContext* dev_ctx; }; + class OpBase; class VarBase { @@ -128,7 +129,11 @@ class VarBase { class OpBase { public: - OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {} + OpBase() + : op_desc_(nullptr), + grad_op_desc_(nullptr), + forward_id_(-1), + backward_id_(-1) {} virtual ~OpBase() { if (grad_op_desc_) delete grad_op_desc_; @@ -139,6 +144,9 @@ class OpBase { framework::OpDesc* op_desc_; framework::OpDesc* grad_op_desc_; + int forward_id_; + int backward_id_; + std::map> input_vars_; std::map> output_vars_; std::map> pre_ops_; @@ -159,7 +167,7 @@ class Layer { } }; -static void CallPythonFunc(py::object* callable, +static void CallPythonFunc(const py::object& callable, const std::vector& ins, std::vector* outs) { py::gil_scoped_acquire guard; @@ -169,7 +177,7 @@ static void CallPythonFunc(py::object* callable, } // TODO(panyx0718): Who owns the returned LoDTensor. 
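   // For now each returned tensor is aliased into a fresh VarBase via ShareDataWith rather than deep-copied.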
- auto ret = (*callable)(in_args); + auto ret = callable(in_args); auto ret_tuple = py::cast(ret); size_t ret_num = py::len(ret_tuple); for (size_t i = 0; i < ret_num; ++i) { @@ -192,17 +200,10 @@ class PyLayer { public: virtual ~PyLayer() {} - static std::vector Apply(py::object* callable, - const std::vector& inputs) { - std::vector tensor_inputs; - std::vector ret; + static void RegisterFunc(int func_id, const py::object& py_func); - for (const VarBase& in : inputs) { - tensor_inputs.push_back(in.var_->Get()); - } - CallPythonFunc(callable, tensor_inputs, &ret); - return ret; - } + static std::vector Apply(int func_id, + const std::vector& inputs); }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index c6eff86fac..1954c7a68a 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -172,6 +172,21 @@ class Tracer { op->block_ = block; } + std::vector PyTrace(OpBase* op, + const std::vector& inputs) { + std::vector outputs = PyLayer::Apply(op->forward_id_, inputs); + /* + for (const VarBase& inp : inputs) { + if (inp.pre_op_) { + op->pre_ops_[it.first].push_back(inp->pre_op_); + op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_); + } else { + op->pre_ops_[it.first].push_back(nullptr); + } + }*/ + return outputs; + } + private: framework::BlockDesc* root_block_; }; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 5c1c7478f4..dbc7843caa 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -26,7 +26,9 @@ void BindTracer(pybind11::module *m) { [](imperative::Tracer &self, framework::BlockDesc *root_block) { new (&self) imperative::Tracer(root_block); }) - .def("trace", &imperative::Tracer::Trace); + .def("trace", &imperative::Tracer::Trace) + .def("py_trace", &imperative::Tracer::PyTrace, + pybind11::return_value_policy::take_ownership); } } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 684b931ee8..455bcc6a41 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -168,6 +168,13 @@ PYBIND11_MODULE(core, m) { self.op_desc_ = op_desc; } }, + py::return_value_policy::reference) + .def_property( + "forward_id", + [](const imperative::OpBase &self) { return self.forward_id_; }, + [](imperative::OpBase &self, int forward_id) { + self.forward_id_ = forward_id; + }, py::return_value_policy::reference); py::class_ layer(m, "Layer"); @@ -179,13 +186,16 @@ PYBIND11_MODULE(core, m) { py::class_(m, "PyLayer") .def(py::init<>()) - .def_static("apply", - [](py::object *callable, - const std::vector &inputs) - -> std::vector { - return imperative::PyLayer::Apply(callable, inputs); - }, - py::return_value_policy::take_ownership); + .def_static( + "apply", + [](int func_id, const std::vector &inputs) + -> std::vector { + return imperative::PyLayer::Apply(func_id, inputs); + }, + py::return_value_policy::take_ownership) + .def_static("register_func", [](int func_id, const py::object &callable) { + imperative::PyLayer::RegisterFunc(func_id, callable); + }); BindTracer(&m); diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 06b6d7ac06..40ec312b69 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -48,7 +48,6 @@ class Layer(core.Layer): raise ValueError("Layer shouldn't implement backward") -# TODO(panyx0718): Inherit from C++ base class. 
class PyLayer(core.PyLayer): """Layers composed of user-defined python codes.""" @@ -65,13 +64,21 @@ class PyLayer(core.PyLayer): @classmethod def __call__(cls, inputs): + tracer = framework._imperative_tracer() + block = framework.default_main_program().current_block() inputs = map(base.to_variable, inputs) inputs = [x._ivar for x in inputs] - ivars = core.PyLayer.apply(cls.forward, inputs) + + PyLayer.register_func(1, cls.forward) + + iop = core.OpBase() + iop.forward_id = 1 + block.ops.append(iop) + ivars = tracer.py_trace(iop, inputs) + # ivars = core.PyLayer.apply(cls.forward, inputs) ret = [] for ivar in ivars: tensor = ivar.value.get_tensor() - block = framework.default_main_program().current_block() py_var = framework.Variable( block, type=core.VarDesc.VarType.LOD_TENSOR, From 7aad6afd4994d4d077bca75d74807bf2d716a22b Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 9 Jan 2019 09:10:42 +0800 Subject: [PATCH 15/22] forward and backward test=develop --- paddle/fluid/imperative/layer.cc | 65 ++++++++++++------- paddle/fluid/imperative/layer.h | 35 +++++++++- paddle/fluid/imperative/tracer.h | 56 +++++++++++++--- paddle/fluid/pybind/pybind.cc | 9 ++- python/paddle/fluid/imperative/layers.py | 7 +- .../fluid/tests/unittests/test_imperative.py | 45 +++++++++++-- 6 files changed, 175 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 53e949d9f9..131e3e1bd5 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -57,6 +57,7 @@ class Autograd { if (var->stop_gradient_) { return; } + VLOG(3) << "start autograd"; std::deque ready; ready.push_back(var->pre_op_); @@ -122,11 +123,10 @@ framework::LoDTensor& VarBase::Grad() { } std::map> OpBase::ApplyGrad() { - if (!grad_op_desc_) { + if (!grad_op_desc_ && backward_id_ <= 0) { LOG(WARNING) << "op with no grad: " << op_desc_->Type(); return {}; } - VLOG(3) << "op grad " << grad_op_desc_->Type(); std::vector> tmp_vars; std::map> grad_outputs; @@ -142,23 +142,30 @@ std::map> OpBase::ApplyGrad() { } } - framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); - - // No need to do compile time infer shape here. - // grad_op_desc_->InferShape(*block_); - grad_op_desc_->InferVarType(block_); - - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc_); - framework::OperatorWithKernel* op_kernel = - dynamic_cast(opbase.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - - framework::Scope scope; - platform::CPUPlace place; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); - p.op.RuntimeInferShape(scope, place, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + if (backward_id_ > 0) { + VLOG(3) << "py_layer_grad"; + PyLayer::ApplyGrad(backward_id_, grad_input_vars_["X@GRAD"], + &(grad_outputs["Out@GRAD"])); + } else { + VLOG(3) << "op grad " << grad_op_desc_->Type(); + framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + + // No need to do compile time infer shape here. 
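+    // Shapes are resolved at run time by RuntimeInferShape right before the kernel launch, so only var types need propagating here.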
+ // grad_op_desc_->InferShape(*block_); + grad_op_desc_->InferVarType(block_); + + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc_); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + } for (auto it : grad_output_vars_) { auto& outputs = grad_outputs[it.first]; @@ -175,6 +182,7 @@ std::map> OpBase::ApplyGrad() { void VarBase::RunBackward() { if (!pre_op_) return; + VLOG(3) << "start backward"; auto grads_t = grads_->GetMutable(); float* data = grads_t->mutable_data(platform::CPUPlace()); std::fill(data, data + grads_t->numel(), 1.0); @@ -190,17 +198,30 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { } std::vector PyLayer::Apply(int func_id, - const std::vector& inputs) { + const std::vector& inputs) { std::vector tensor_inputs; std::vector ret; - for (const VarBase& in : inputs) { - tensor_inputs.push_back(in.var_->Get()); + for (const VarBase* in : inputs) { + tensor_inputs.push_back(in->var_->Get()); } PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); CallPythonFunc(py_funcs_[func_id], tensor_inputs, &ret); return ret; } +void PyLayer::ApplyGrad(int func_id, + const std::vector& inputs, + std::vector* outputs) { + std::vector tensor_inputs; + std::vector ret; + + for (const Variable* in : inputs) { + tensor_inputs.push_back(in->Get()); + } + PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); + CallPythonFunc(py_funcs_[func_id], tensor_inputs, outputs); +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 52cbb2c015..84e04cb74e 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -196,6 +196,35 @@ static void CallPythonFunc(const py::object& callable, } } +static void CallPythonFunc(const py::object& callable, + const std::vector& ins, + std::vector* outs) { + py::gil_scoped_acquire guard; + py::tuple in_args(ins.size()); + for (size_t i = 0; i < ins.size(); ++i) { + in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr); + } + VLOG(3) << "pyfunc in " << py::len(in_args); + + // TODO(panyx0718): Who owns the returned LoDTensor. 
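+  // This overload fills the caller-provided output Variables in place instead of allocating new VarBases.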
+ auto ret = callable(in_args); + auto ret_tuple = py::cast(ret); + size_t ret_num = py::len(ret_tuple); + VLOG(3) << "pyfunc out " << ret_num; + for (size_t i = 0; i < ret_num; ++i) { + try { + auto* py_out_tensor = py::cast(ret_tuple[i]); + PADDLE_ENFORCE_NOT_NULL(py_out_tensor, + "Output tensor %d should not be nullptr", i); + auto* tensor = (*outs)[i]->GetMutable(); + tensor->ShareDataWith(*py_out_tensor); + tensor->set_lod(py_out_tensor->lod()); + } catch (py::cast_error&) { + PADDLE_THROW("The %d-th output must be LoDTensor", i); + } + } +} + class PyLayer { public: virtual ~PyLayer() {} @@ -203,7 +232,11 @@ class PyLayer { static void RegisterFunc(int func_id, const py::object& py_func); static std::vector Apply(int func_id, - const std::vector& inputs); + const std::vector& inputs); + + static void ApplyGrad(int func_id, + const std::vector& inputs, + std::vector* outputs); }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 1954c7a68a..f6aebea9bb 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -131,6 +131,7 @@ class Tracer { if (!stop_gradient) { framework::OpDesc* grad_op_desc; + // TODO(panyx): Is this leaked? auto grad_to_var = new std::unordered_map(); CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); op->grad_op_desc_ = grad_op_desc; @@ -143,12 +144,14 @@ class Tracer { if (var_it == grad_to_var->end()) { auto fwd_var_it = vars.find(grad_invar); PADDLE_ENFORCE(fwd_var_it != vars.end()); + // Forward inputs or outputs. grad_in_vars.push_back(fwd_var_it->second->var_); } else { VarBase* var = vars[var_it->second]; if (!var->grads_->IsInitialized()) { InitVar(var->var_, var->grads_); } + // Douts. grad_in_vars.push_back(var->grads_); } } @@ -172,18 +175,51 @@ class Tracer { op->block_ = block; } - std::vector PyTrace(OpBase* op, - const std::vector& inputs) { - std::vector outputs = PyLayer::Apply(op->forward_id_, inputs); - /* - for (const VarBase& inp : inputs) { - if (inp.pre_op_) { - op->pre_ops_[it.first].push_back(inp->pre_op_); - op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_); + std::vector PyTrace(OpBase* op, const std::vector& inputs, + bool stop_gradient = false) { + VLOG(3) << "py_trace"; + op->input_vars_["X"] = inputs; + op->output_vars_["Out"] = PyLayer::Apply(op->forward_id_, inputs); + for (VarBase* inp : inputs) { + if (inp->pre_op_) { + op->pre_ops_["X"].push_back(inp->pre_op_); + op->pre_ops_out_idx_["X"].push_back(inp->pre_op_out_idx_); } else { - op->pre_ops_[it.first].push_back(nullptr); + op->pre_ops_["X"].push_back(nullptr); } - }*/ + } + + auto& outputs = op->output_vars_["Out"]; + for (size_t i = 0; i < outputs.size(); ++i) { + VarBase* out = outputs[i]; + out->stop_gradient_ = stop_gradient; + out->pre_op_ = op; + out->pre_op_out_name_ = "Out"; + out->pre_op_out_idx_ = i; + } + if (!stop_gradient) { + auto& grad_input_vars = op->grad_input_vars_["X@GRAD"]; + auto& grad_output_vars = op->grad_output_vars_["Out@GRAD"]; + + for (const VarBase* inp : inputs) { + grad_input_vars.push_back(inp->var_); + } + for (VarBase* out : outputs) { + grad_input_vars.push_back(out->var_); + } + for (VarBase* out : outputs) { + grad_input_vars.push_back(out->grads_); + if (!grad_input_vars.back()->IsInitialized()) { + InitVar(out->var_, grad_input_vars.back()); + } + } + for (const VarBase* inp : inputs) { + grad_output_vars.push_back(inp->grads_); + if (!grad_output_vars.back()->IsInitialized()) { + InitVar(inp->var_, 
grad_output_vars.back()); + } + } + } return outputs; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 455bcc6a41..93dd16c8c9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -175,6 +175,13 @@ PYBIND11_MODULE(core, m) { [](imperative::OpBase &self, int forward_id) { self.forward_id_ = forward_id; }, + py::return_value_policy::reference) + .def_property( + "backward_id", + [](const imperative::OpBase &self) { return self.backward_id_; }, + [](imperative::OpBase &self, int backward_id) { + self.backward_id_ = backward_id; + }, py::return_value_policy::reference); py::class_ layer(m, "Layer"); @@ -188,7 +195,7 @@ PYBIND11_MODULE(core, m) { .def(py::init<>()) .def_static( "apply", - [](int func_id, const std::vector &inputs) + [](int func_id, const std::vector &inputs) -> std::vector { return imperative::PyLayer::Apply(func_id, inputs); }, diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 40ec312b69..2b224b8dbb 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -59,22 +59,23 @@ class PyLayer(core.PyLayer): raise NotImplementedError @staticmethod - def backward(inputs): + def backward(douts): raise NotImplementedError @classmethod def __call__(cls, inputs): tracer = framework._imperative_tracer() block = framework.default_main_program().current_block() - inputs = map(base.to_variable, inputs) inputs = [x._ivar for x in inputs] PyLayer.register_func(1, cls.forward) + PyLayer.register_func(2, cls.backward) iop = core.OpBase() iop.forward_id = 1 + iop.backward_id = 2 block.ops.append(iop) - ivars = tracer.py_trace(iop, inputs) + ivars = tracer.py_trace(iop, inputs, False) # ivars = core.PyLayer.apply(cls.forward, inputs) ret = [] for ivar in ivars: diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 133e1e65c7..9f93ba9338 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -49,8 +49,18 @@ class MyPyLayer(fluid.imperative.PyLayer): return tuple([tensor]) @staticmethod - def backward(douts, outs): - return np.array(douts[0]) * (1 - np.square(np.array(outs[0]))) + def backward(inputs): + sys.stderr.write('calling into backward: %s\n' % str(inputs)) + inp, out, dout = inputs + inp = np.array(inp) + out = np.array(out) + dout = np.array(dout) + sys.stderr.write('calling into backward: %s, %s, %s\n' % + (inp, out, dout)) + ret = np.array(dout) * (1 - np.square(np.array(out))) + tensor = core.LoDTensor() + tensor.set(ret, core.CPUPlace()) + return tuple([tensor]) class MLP(fluid.imperative.Layer): @@ -71,20 +81,44 @@ class MLP(fluid.imperative.Layer): class TestImperative(unittest.TestCase): + """ def test_layer(self): with fluid.imperative.guard(): cl = core.Layer() cl.forward([]) l = fluid.imperative.Layer() self.assertRaises(NotImplementedError, l.forward, []) + """ def test_pylayer(self): + np_inp = np.ones([2, 2], np.float32) with fluid.imperative.guard(): my_py_layer = MyPyLayer() - outs = my_py_layer([np.ones([2, 2], np.float32)]) - sys.stderr.write('%s\n' % outs[0]._numpy()) - # out.backward() + var_inp = fluid.imperative.base.to_variable(np_inp) + outs = my_py_layer([var_inp]) + dy_out = np.sum(outs[0]._numpy()) + outs[0]._backward() + dy_grad = var_inp._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[2, 2], 
append_batch_size=False) + # TODO(panyx0718): Paddle doesn't diff against data `inp`. + x1 = inp * 1 + # TODO(panyx0718): If reduce_sum is skipped, the result is wrong. + x = fluid.layers.reduce_sum(fluid.layers.tanh(x1)) + param_grads = fluid.backward.append_backward( + x, parameter_list=[x1.name])[0] + exe = fluid.Executor(fluid.CPUPlace()) + + static_out, static_grad = exe.run( + feed={inp.name: np_inp}, + fetch_list=[x.name, param_grads[1].name]) + + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad, static_grad)) + """ def test_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): @@ -138,6 +172,7 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(dy_out, static_out)) self.assertTrue(np.allclose(dy_grad, static_grad)) + """ if __name__ == '__main__': From 9597fd05e9b1f669ad6102d67069f1c04e8840f8 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 9 Jan 2019 15:42:05 +0800 Subject: [PATCH 16/22] polish test=develop --- paddle/fluid/imperative/layer.cc | 90 ++++++++++++------- paddle/fluid/imperative/layer.h | 87 +++++------------- paddle/fluid/imperative/tracer.h | 5 +- paddle/fluid/pybind/pybind.cc | 10 ++- python/paddle/fluid/imperative/layers.py | 11 ++- .../fluid/tests/unittests/test_imperative.py | 44 ++++++++- 6 files changed, 136 insertions(+), 111 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 131e3e1bd5..d0aaa00c49 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -128,26 +128,23 @@ std::map> OpBase::ApplyGrad() { return {}; } - std::vector> tmp_vars; std::map> grad_outputs; - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - for (size_t i = 0; i < it.second.size(); ++i) { - // Allocate a new variable - Variable* tmp_var = new framework::Variable(); - tmp_var->GetMutable(); - - tmp_vars.emplace_back(tmp_var); - outputs.push_back(tmp_var); - } - } - if (backward_id_ > 0) { VLOG(3) << "py_layer_grad"; - PyLayer::ApplyGrad(backward_id_, grad_input_vars_["X@GRAD"], - &(grad_outputs["Out@GRAD"])); + grad_outputs["Out@GRAD"] = + PyLayer::ApplyGrad(backward_id_, grad_input_vars_["X@GRAD"]); } else { VLOG(3) << "op grad " << grad_op_desc_->Type(); + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + // Allocate a new variable + Variable* tmp_var = new framework::Variable(); + tmp_var->GetMutable(); + outputs.push_back(tmp_var); + } + } + framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); // No need to do compile time infer shape here. 
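(The next hunk tightens gradient-buffer ownership: scratch grad Variables are allocated per ApplyGrad call and deleted once AddTo has accumulated them into the original gradients.)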
@@ -170,10 +167,13 @@ std::map> OpBase::ApplyGrad() { for (auto it : grad_output_vars_) { auto& outputs = grad_outputs[it.first]; auto& origin_outputs = it.second; + PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* grad = outputs[i]; framework::Variable* orig_grad = origin_outputs[i]; - AddTo(outputs[i], orig_grad); + AddTo(grad, orig_grad); + delete grad; } } return input_vars_; @@ -197,30 +197,60 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { py_funcs_[func_id] = py_func; } +int PyLayer::NumFuncs() { return py_funcs_.size(); } + std::vector PyLayer::Apply(int func_id, const std::vector& inputs) { - std::vector tensor_inputs; - std::vector ret; - + std::vector invars; for (const VarBase* in : inputs) { - tensor_inputs.push_back(in->var_->Get()); + invars.push_back(in->var_); } PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); - CallPythonFunc(py_funcs_[func_id], tensor_inputs, &ret); + std::vector outvars = CallPythonFunc(py_funcs_[func_id], invars); + std::vector ret; + for (Variable* v : outvars) { + ret.push_back(new VarBase(v, new Variable())); + } return ret; } -void PyLayer::ApplyGrad(int func_id, - const std::vector& inputs, - std::vector* outputs) { - std::vector tensor_inputs; - std::vector ret; +std::vector PyLayer::ApplyGrad( + int func_id, const std::vector& inputs) { + PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); + return CallPythonFunc(py_funcs_[func_id], inputs); +} - for (const Variable* in : inputs) { - tensor_inputs.push_back(in->Get()); +std::vector PyLayer::CallPythonFunc( + const py::object& callable, const std::vector& ins) { + py::gil_scoped_acquire guard; + py::tuple in_args(ins.size()); + for (size_t i = 0; i < ins.size(); ++i) { + const framework::LoDTensor& t = ins[i]->Get(); + in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr); } - PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); - CallPythonFunc(py_funcs_[func_id], tensor_inputs, outputs); + VLOG(3) << "pyfunc in " << py::len(in_args); + + // TODO(panyx0718): Who owns the returned LoDTensor. 
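+  // Each output is wrapped below in a newly allocated framework::Variable owned by the caller, while the tensor buffer itself stays shared with the Python side.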
+ auto ret = callable(in_args); + auto ret_tuple = py::cast(ret); + size_t ret_num = py::len(ret_tuple); + std::vector outs; + VLOG(3) << "pyfunc out " << ret_num; + for (size_t i = 0; i < ret_num; ++i) { + try { + auto* py_out_tensor = py::cast(ret_tuple[i]); + PADDLE_ENFORCE_NOT_NULL(py_out_tensor, + "Output tensor %d should not be nullptr", i); + auto* var = new framework::Variable(); + auto* tensor = var->GetMutable(); + tensor->ShareDataWith(*py_out_tensor); + tensor->set_lod(py_out_tensor->lod()); + outs.push_back(var); + } catch (py::cast_error&) { + PADDLE_THROW("The %d-th output must be LoDTensor", i); + } + } + return outs; } } // namespace imperative diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 84e04cb74e..4be0614c7e 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -87,12 +87,15 @@ class OpBase; class VarBase { public: - VarBase() + VarBase() : VarBase(new framework::Variable(), new framework::Variable()) {} + + // Owns `var` and `grad` + VarBase(framework::Variable* var, framework::Variable* grad) : pre_op_(nullptr), pre_op_out_idx_(-1), var_desc_(nullptr), - var_(new framework::Variable()), - grads_(new framework::Variable()), + var_(var), + grads_(grad), stop_gradient_(false) {} explicit VarBase(bool stop_gradient) @@ -131,8 +134,8 @@ class OpBase { public: OpBase() : op_desc_(nullptr), - grad_op_desc_(nullptr), forward_id_(-1), + grad_op_desc_(nullptr), backward_id_(-1) {} virtual ~OpBase() { @@ -141,10 +144,13 @@ class OpBase { std::map> ApplyGrad(); + // One of `op_desc_` or `forward_id_` is set, not both. + // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. framework::OpDesc* op_desc_; - framework::OpDesc* grad_op_desc_; - int forward_id_; + // When has backward, one of `grad_op_desc_` or `backward_id_` is set, + // not both. + framework::OpDesc* grad_op_desc_; int backward_id_; std::map> input_vars_; @@ -167,76 +173,23 @@ class Layer { } }; -static void CallPythonFunc(const py::object& callable, - const std::vector& ins, - std::vector* outs) { - py::gil_scoped_acquire guard; - py::tuple in_args(ins.size()); - for (size_t i = 0; i < ins.size(); ++i) { - in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr); - } - - // TODO(panyx0718): Who owns the returned LoDTensor. - auto ret = callable(in_args); - auto ret_tuple = py::cast(ret); - size_t ret_num = py::len(ret_tuple); - for (size_t i = 0; i < ret_num; ++i) { - try { - auto* py_out_tensor = py::cast(ret_tuple[i]); - PADDLE_ENFORCE_NOT_NULL(py_out_tensor, - "Output tensor %d should not be nullptr", i); - VarBase* var = new VarBase(); - auto* tensor = var->var_->GetMutable(); - tensor->ShareDataWith(*py_out_tensor); - tensor->set_lod(py_out_tensor->lod()); - outs->push_back(var); - } catch (py::cast_error&) { - PADDLE_THROW("The %d-th output must be LoDTensor", i); - } - } -} - -static void CallPythonFunc(const py::object& callable, - const std::vector& ins, - std::vector* outs) { - py::gil_scoped_acquire guard; - py::tuple in_args(ins.size()); - for (size_t i = 0; i < ins.size(); ++i) { - in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr); - } - VLOG(3) << "pyfunc in " << py::len(in_args); - - // TODO(panyx0718): Who owns the returned LoDTensor. 
- auto ret = callable(in_args); - auto ret_tuple = py::cast(ret); - size_t ret_num = py::len(ret_tuple); - VLOG(3) << "pyfunc out " << ret_num; - for (size_t i = 0; i < ret_num; ++i) { - try { - auto* py_out_tensor = py::cast(ret_tuple[i]); - PADDLE_ENFORCE_NOT_NULL(py_out_tensor, - "Output tensor %d should not be nullptr", i); - auto* tensor = (*outs)[i]->GetMutable(); - tensor->ShareDataWith(*py_out_tensor); - tensor->set_lod(py_out_tensor->lod()); - } catch (py::cast_error&) { - PADDLE_THROW("The %d-th output must be LoDTensor", i); - } - } -} - class PyLayer { public: virtual ~PyLayer() {} static void RegisterFunc(int func_id, const py::object& py_func); + static int NumFuncs(); + static std::vector Apply(int func_id, const std::vector& inputs); - static void ApplyGrad(int func_id, - const std::vector& inputs, - std::vector* outputs); + static std::vector ApplyGrad( + int func_id, const std::vector& inputs); + + private: + static std::vector CallPythonFunc( + const py::object& callable, const std::vector& ins); }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index f6aebea9bb..f68a67e5d7 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -132,8 +132,9 @@ class Tracer { if (!stop_gradient) { framework::OpDesc* grad_op_desc; // TODO(panyx): Is this leaked? - auto grad_to_var = new std::unordered_map(); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); + std::unique_ptr> grad_to_var( + new std::unordered_map()); + CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get()); op->grad_op_desc_ = grad_op_desc; for (auto it : grad_op_desc->Inputs()) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 93dd16c8c9..e52401ced7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -191,7 +191,7 @@ PYBIND11_MODULE(core, m) { return self.Forward(inputs); }); - py::class_(m, "PyLayer") + py::class_(m, "PyLayer") .def(py::init<>()) .def_static( "apply", @@ -200,9 +200,11 @@ PYBIND11_MODULE(core, m) { return imperative::PyLayer::Apply(func_id, inputs); }, py::return_value_policy::take_ownership) - .def_static("register_func", [](int func_id, const py::object &callable) { - imperative::PyLayer::RegisterFunc(func_id, callable); - }); + .def_static("register_func", + [](int func_id, const py::object &callable) { + imperative::PyLayer::RegisterFunc(func_id, callable); + }) + .def_static("num_funcs", &imperative::PyLayer::NumFuncs); BindTracer(&m); diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 2b224b8dbb..8027d9ba3b 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -68,12 +68,15 @@ class PyLayer(core.PyLayer): block = framework.default_main_program().current_block() inputs = [x._ivar for x in inputs] - PyLayer.register_func(1, cls.forward) - PyLayer.register_func(2, cls.backward) + if not hasattr(cls, 'forward_id'): + cls.forward_id = core.PyLayer.num_funcs() + 1 + PyLayer.register_func(cls.forward_id, cls.forward) + cls.backward_id = core.PyLayer.num_funcs() + 1 + PyLayer.register_func(cls.backward_id, cls.backward) iop = core.OpBase() - iop.forward_id = 1 - iop.backward_id = 2 + iop.forward_id = cls.forward_id + iop.backward_id = cls.backward_id block.ops.append(iop) ivars = tracer.py_trace(iop, inputs, False) # ivars = core.PyLayer.apply(cls.forward, inputs) diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 9f93ba9338..e3e1ce7ca3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -81,14 +81,52 @@ class MLP(fluid.imperative.Layer): class TestImperative(unittest.TestCase): - """ def test_layer(self): with fluid.imperative.guard(): cl = core.Layer() cl.forward([]) l = fluid.imperative.Layer() self.assertRaises(NotImplementedError, l.forward, []) - """ + + def test_pylayer_func_id(self): + + with fluid.imperative.guard(): + + class PyLayer1(fluid.imperative.PyLayer): + def __init__(self): + super(PyLayer1, self).__init__() + + @staticmethod + def forward(inputs): + return inputs + + @staticmethod + def backward(inputs): + return inputs + + class PyLayer2(fluid.imperative.PyLayer): + def __init__(self): + super(PyLayer2, self).__init__() + + @staticmethod + def forward(inputs): + return inputs + + @staticmethod + def backward(inputs): + return inputs + + py_layer_1 = PyLayer1() + py_layer_2 = PyLayer2() + py_layer_1([fluid.imperative.base.to_variable(np.ones([2, 2]))]) + py_layer_2([fluid.imperative.base.to_variable(np.ones([2, 2]))]) + id = py_layer_1.forward_id + self.assertGreater(id, 0) + self.assertEqual(py_layer_1.backward_id, id + 1) + self.assertEqual(py_layer_2.forward_id, id + 2) + self.assertEqual(py_layer_2.backward_id, id + 3) + py_layer_1([fluid.imperative.base.to_variable(np.ones([2, 2]))]) + self.assertEqual(py_layer_1.forward_id, id) def test_pylayer(self): np_inp = np.ones([2, 2], np.float32) @@ -118,7 +156,6 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(dy_out, static_out)) self.assertTrue(np.allclose(dy_grad, static_grad)) - """ def test_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): @@ -172,7 +209,6 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(dy_out, static_out)) self.assertTrue(np.allclose(dy_grad, static_grad)) - """ if __name__ == '__main__': From 439691f5bd03337f0d042fd81dca15fc7abcbd8f Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 10 Jan 2019 16:37:43 +0800 Subject: [PATCH 17/22] adjust the shlwapi on windows test=develop --- cmake/external/gflags.cmake | 9 ++++++++ cmake/generic.cmake | 22 +++++++++---------- .../inference/api/demo_ci/CMakeLists.txt | 4 ++-- paddle/fluid/inference/utils/CMakeLists.txt | 3 --- paddle/fluid/platform/float16.h | 2 +- paddle/fluid/pybind/CMakeLists.txt | 5 ++--- 6 files changed, 24 insertions(+), 21 deletions(-) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 4e98e4bf88..95ca16f57f 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -63,6 +63,15 @@ ADD_DEPENDENCIES(gflags extern_gflags) LIST(APPEND external_project_dependencies gflags) +# On Windows (including MinGW), the Shlwapi library is used by gflags if available. 
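+# Recording it in a global property lets every target pick the dependency up at link time.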
+if (WIN32) + include(CheckIncludeFileCXX) + check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI) + if (HAVE_SHLWAPI) + set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib) + endif(HAVE_SHLWAPI) +endif (WIN32) + IF(WITH_C_API) INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags) IF(ANDROID) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 81a910100a..63820fd4f0 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -359,6 +359,8 @@ function(cc_binary TARGET_NAME) add_dependencies(${TARGET_NAME} ${cc_binary_DEPS}) common_link(${TARGET_NAME}) endif() + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${os_dependency_modules}) endfunction(cc_binary) function(cc_test TARGET_NAME) @@ -367,18 +369,15 @@ function(cc_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_test_SRCS}) if(WIN32) - list(APPEND win32_deps shlwapi) if("${cc_test_DEPS};" MATCHES "python;") list(REMOVE_ITEM cc_test_DEPS python) - list(APPEND win32_deps ${PYTHON_LIBRARIES}) + target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) endif() endif(WIN32) - add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) - if(WIN32) - target_link_libraries(${TARGET_NAME} ${win32_deps}) - endif(WIN32) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) common_link(${TARGET_NAME}) add_test(NAME ${TARGET_NAME} @@ -451,10 +450,8 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) - if(WIN32) - target_link_libraries(${TARGET_NAME} shlwapi) - endif(WIN32) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules}) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) @@ -541,7 +538,8 @@ function(hip_test TARGET_NAME) endif() add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) - target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags ${os_dependency_modules}) add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index f42ee9a697..aaf137601c 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt 
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -128,8 +128,8 @@ else() ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash snappystream ${EXTERNAL_LIB}) - # NOTE(dzhwinter) shlwapi is deprecated. - set(DEPS ${DEPS} libcmt shlwapi) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + set(DEPS ${DEPS} libcmt ${os_dependency_modules}) endif(NOT WIN32) if(WITH_GPU) diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index cfb80fe6ec..c43eaf7f98 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -2,6 +2,3 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce) cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_binary(visualizer SRCS visualizer.cc DEPS analysis paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes) -if(WIN32) - target_link_libraries(visualizer shlwapi) -endif(WIN32) diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index 98afe843c0..c203f4e04a 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -59,7 +59,7 @@ limitations under the License. */ #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else -#define PADDLE_ALIGN(x) /*do nothing*/ +#define PADDLE_ALIGN(x) __declspec(align(x)) #endif namespace paddle { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 2545f5312f..ca2764e64f 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -21,9 +21,8 @@ if(WITH_PYTHON) endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(WITH_AMD_GPU) - if(WIN32) - target_link_libraries(paddle_pybind shlwapi) - endif(WIN32) + get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(paddle_pybind ${os_dependency_modules}) cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) endif(WITH_PYTHON) From c3a9f3c4b2c3465f8779cbed4ec8f86479647bde Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 10 Jan 2019 14:06:35 +0000 Subject: [PATCH 18/22] fix typo and refine test=develop --- paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc | 2 +- paddle/fluid/operators/jit/benchmark.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc index 96a3b7ee05..fa75e3b4aa 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -50,7 +50,7 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern, // the other one should be unused empty var. 
if (is_nth_input_var_of_concat(x->outputs[0], idx)) { satisfied_all = satisfied_all && x->outputs[1]->IsVar() && - x->outputs[1]->outputs.size() == 0; + x->outputs[1]->outputs.empty(); } else { satisfied_all = satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) && diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index bde2791add..4b4ce07fa7 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -52,11 +52,11 @@ struct BenchFunc { for (int i = 0; i < FLAGS_burning; ++i) { tgt(args...); } - auto start = paddle::platform::PosixInNsec() / 1e-3; + auto start = paddle::platform::PosixInNsec() * 1e-3; for (int i = 0; i < FLAGS_repeat; ++i) { tgt(args...); } - auto end = paddle::platform::PosixInNsec() / 1e-3; + auto end = paddle::platform::PosixInNsec() * 1e-3; return static_cast(end - start) / FLAGS_repeat; } }; From 3472f6735694bc6c4ccefff4ba1fc2220dd276e0 Mon Sep 17 00:00:00 2001 From: lujun Date: Fri, 11 Jan 2019 10:41:19 +0800 Subject: [PATCH 19/22] fix mnist-dataset bug at windows,test=develop --- python/paddle/dataset/mnist.py | 91 ++++++++++++++++------------------ 1 file changed, 43 insertions(+), 48 deletions(-) diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 38addd0cfd..847ca18720 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -21,10 +21,9 @@ parse training set and test set into paddle reader creators. from __future__ import print_function import paddle.dataset.common -import subprocess +import gzip import numpy -import platform -import tempfile +import struct from six.moves import range __all__ = ['train', 'test', 'convert'] @@ -41,51 +40,47 @@ TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' def reader_creator(image_filename, label_filename, buffer_size): def reader(): - if platform.system() == 'Darwin': - zcat_cmd = 'gzcat' - elif platform.system() == 'Linux': - zcat_cmd = 'zcat' - else: - raise NotImplementedError() - - # According to http://stackoverflow.com/a/38061619/724872, we - # cannot use standard package gzip here. - tmp_image_file = tempfile.TemporaryFile(prefix='paddle_dataset') - m = subprocess.Popen( - [zcat_cmd, image_filename], stdout=tmp_image_file).communicate() - tmp_image_file.seek(16) # skip some magic bytes - - # Python3 will not take stdout as file - tmp_label_file = tempfile.TemporaryFile(prefix='paddle_dataset') - l = subprocess.Popen( - [zcat_cmd, label_filename], stdout=tmp_label_file).communicate() - tmp_label_file.seek(8) # skip some magic bytes - - try: # reader could be break. - while True: - labels = numpy.fromfile( - tmp_label_file, 'ubyte', count=buffer_size).astype("int") - - if labels.size != buffer_size: - break # numpy.fromfile returns empty slice after EOF. 
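The benchmark.cc hunk above fixes a genuine unit bug: PosixInNsec() returns nanoseconds, and dividing by 1e-3 multiplies by 1000, so the old code reported values a million times larger than the intended microseconds; multiplying by 1e-3 converts nanoseconds to microseconds correctly. A minimal sketch of the corrected conversion, using std::chrono as a stand-in for paddle::platform::PosixInNsec:

#include <chrono>
#include <cstdio>

// Returns the current time in microseconds, mirroring the fixed expression
// PosixInNsec() * 1e-3 (nanoseconds -> microseconds).
static double NowInUsec() {
  auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                std::chrono::steady_clock::now().time_since_epoch())
                .count();
  return static_cast<double>(ns) * 1e-3;  // ns -> us
}

int main() {
  const int repeat = 1000;
  double start = NowInUsec();
  volatile long long sink = 0;
  for (int r = 0; r < repeat; ++r) {
    for (int i = 0; i < 1000; ++i) sink += i;  // stands in for tgt(args...)
  }
  double end = NowInUsec();
  std::printf("avg: %f us per repeat\n", (end - start) / repeat);
  return 0;
}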
- - images = numpy.fromfile( - tmp_image_file, 'ubyte', count=buffer_size * 28 * - 28).reshape((buffer_size, 28 * 28)).astype('float32') - - images = images / 255.0 * 2.0 - 1.0 - - for i in range(buffer_size): - yield images[i, :], int(labels[i]) - finally: - try: - m.terminate() - except: - pass - try: - l.terminate() - except: - pass + with gzip.GzipFile(image_filename, 'rb') as image_file: + img_buf = image_file.read() + with gzip.GzipFile(label_filename, 'rb') as label_file: + lab_buf = label_file.read() + + step_label = 0 + + offset_img = 0 + # read from Big-endian + # get file info from magic byte + # image file : 16B + magic_byte_img = '>IIII' + magic_img, image_num, rows, cols = struct.unpack_from( + magic_byte_img, img_buf, offset_img) + offset_img += struct.calcsize(magic_byte_img) + + offset_lab = 0 + # label file : 8B + magic_byte_lab = '>II' + magic_lab, label_num = struct.unpack_from(magic_byte_lab, + lab_buf, offset_lab) + offset_lab += struct.calcsize(magic_byte_lab) + + while True: + if step_label >= label_num: + break + fmt_label = '>' + str(buffer_size) + 'B' + labels = struct.unpack_from(fmt_label, lab_buf, offset_lab) + offset_lab += struct.calcsize(fmt_label) + step_label += buffer_size + + fmt_images = '>' + str(buffer_size * rows * cols) + 'B' + images_temp = struct.unpack_from(fmt_images, img_buf, + offset_img) + images = numpy.reshape(images_temp, ( + buffer_size, rows * cols)).astype('float32') + offset_img += struct.calcsize(fmt_images) + + images = images / 255.0 * 2.0 - 1.0 + for i in range(buffer_size): + yield images[i, :], int(labels[i]) return reader From 358e657f68cfc4a2abc84ca4b1c46480ef09b171 Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 11 Jan 2019 12:45:47 +0800 Subject: [PATCH 20/22] Revert "Remove workspace_handle in conv_cudnn (#15186)" test=develop This reverts commit 064512aa47b9ea35c0b5479b32c1653512c8b7c4. 
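The rewritten MNIST reader above drops the zcat/gzcat subprocess (which does not exist on Windows) and parses the gzip-compressed IDX files directly with gzip + struct, reading the big-endian headers explicitly: 16 bytes (magic, image count, rows, cols) for images and 8 bytes (magic, label count) for labels. A rough C++ rendering of the same header parsing, assuming the bytes are already decompressed (the helper name is illustrative):

#include <cstdint>
#include <cstdio>

// Read a 32-bit big-endian integer, as struct.unpack_from('>I', ...) does.
static uint32_t ReadBE32(const unsigned char* p) {
  return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) |
         (uint32_t(p[2]) << 8) | uint32_t(p[3]);
}

int main() {
  // 16-byte IDX image header: magic, image count, rows, cols (big-endian).
  const unsigned char header[16] = {0, 0, 8, 3,  0, 0, 234, 96,
                                    0, 0, 0, 28, 0, 0, 0,   28};
  uint32_t magic = ReadBE32(header + 0);   // 0x00000803 for ubyte image files
  uint32_t num   = ReadBE32(header + 4);   // 60000 training images
  uint32_t rows  = ReadBE32(header + 8);   // 28
  uint32_t cols  = ReadBE32(header + 12);  // 28
  std::printf("magic=%u num=%u rows=%u cols=%u\n", magic, num, rows, cols);
  return 0;
}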
--- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/operators/conv_cudnn_op.cu.cc | 149 ++++++++---------- paddle/fluid/platform/device_context.cc | 28 ++-- paddle/fluid/platform/device_context.h | 2 +- paddle/fluid/platform/temporary_allocator.cc | 63 ++------ paddle/fluid/platform/temporary_allocator.h | 10 +- .../platform/temporary_allocator_test.cc | 58 +------ python/paddle/fluid/__init__.py | 3 +- 8 files changed, 107 insertions(+), 208 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 041187665a..4d29564aee 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -391,7 +391,7 @@ class ExecutionContext { PADDLE_ENFORCE( dynamic_cast(allocation_ptr) != nullptr, "The AllocationPtr must be TemporaryAllocation."); - PADDLE_ENFORCE_GE(allocation_ptr->size(), + PADDLE_ENFORCE_EQ(allocation_ptr->size(), framework::product(dim) * sizeof(T)); paddle::framework::Tensor temp_tensor( diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index f5208e7a60..25a723fc07 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -137,6 +137,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); bool half_float = false; #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) @@ -157,8 +158,6 @@ class CUDNNConvOpKernel : public framework::OpKernel { VLOG(5) << "NOT use cudnn_tensor_op_math"; } #endif - Tensor cudnn_workspace; - void* cudnn_workspace_ptr = nullptr; auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); @@ -181,26 +180,21 @@ class CUDNNConvOpKernel : public framework::OpKernel { .Var(kCUDNNFwdAlgoCache) ->GetMutable>(); } - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_limit)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - algo = algo_cache->GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array fwd_perf_stat; - - CUDNN_ENFORCE( - platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, - output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - fwd_perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, + output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, + workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { @@ -225,23 +219,17 @@ class CUDNNConvOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - // Allocate on GPU memory - if (!cudnn_workspace_ptr) { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_in_bytes)}), - dev_ctx); - 
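The revert above returns conv_cudnn to dev_ctx.cudnn_workspace_handle(): instead of allocating a temporary tensor and passing a raw cudnn_workspace_ptr around, every cuDNN call that needs scratch space is wrapped in a lambda and handed to RunFunc, so the workspace pointer is only valid inside the callback. A simplified host-side sketch of that handle-and-callback pattern (the real handle manages device memory):

#include <cstddef>
#include <functional>
#include <vector>

// Simplified stand-in for platform::CudnnWorkspaceHandle: it owns the scratch
// buffer and lends it to a callback, so no caller can retain a workspace
// pointer past the call that needed it.
class WorkspaceHandle {
 public:
  void RunFunc(const std::function<void(void*)>& cudnn_func,
               size_t required_workspace_bytes) {
    if (buffer_.size() < required_workspace_bytes) {
      buffer_.resize(required_workspace_bytes);
    }
    cudnn_func(buffer_.data());  // workspace pointer is scoped to the lambda
  }

 private:
  std::vector<unsigned char> buffer_;
};

int main() {
  WorkspaceHandle workspace_handle;
  size_t workspace_size_in_bytes = 1 << 20;
  workspace_handle.RunFunc(
      [&](void* cudnn_workspace) {
        // e.g. cudnnConvolutionForward(..., cudnn_workspace,
        //                              workspace_size_in_bytes, ...);
        (void)cudnn_workspace;
      },
      workspace_size_in_bytes);
  return 0;
}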
cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - } // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_filter_desc, filter_data + i * group_offset_filter, - cudnn_conv_desc, algo, cudnn_workspace_ptr, workspace_size_in_bytes, - &beta, cudnn_output_desc, output_data + i * group_offset_out)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } }; @@ -365,20 +353,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { workspace_size_limit = max_user_size * 1024 * 1024; } - Tensor cudnn_workspace; - void* cudnn_workspace_ptr = nullptr; - if ((input_data || filter_data) && exhaustive_search) { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_limit)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - } - auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); if (exhaustive_search) { @@ -396,22 +374,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ->GetMutable< AlgorithmsCache>(); } - data_algo = data_algo_cache->GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array data_perf_stat; - - CUDNN_ENFORCE(platform::dynload:: - cudnnFindConvolutionBackwardDataAlgorithmEx( - handle, cudnn_filter_desc, filter_data, - cudnn_output_grad_desc, output_grad_data, - cudnn_conv_desc, cudnn_input_desc, - input_grad_data, kNUM_CUDNN_BWD_DATA_ALGS, - &returned_algo_count, data_perf_stat.data(), - cudnn_workspace_ptr, workspace_size_limit)); + auto cudnn_find_bd_data_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload:: + cudnnFindConvolutionBackwardDataAlgorithmEx( + handle, cudnn_filter_desc, filter_data, + cudnn_output_grad_desc, output_grad_data, + cudnn_conv_desc, cudnn_input_desc, input_grad_data, + kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, + data_perf_stat.data(), cudnn_workspace, + workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_bd_data_func, + workspace_size_limit); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { @@ -462,23 +443,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ->GetMutable< AlgorithmsCache>(); } - filter_algo = f_algo_cache->GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array filter_perf_stat; - - CUDNN_ENFORCE( - platform::dynload:: - cudnnFindConvolutionBackwardFilterAlgorithmEx( - handle, cudnn_input_desc, input_data, - cudnn_output_grad_desc, output_grad_data, - cudnn_conv_desc, cudnn_filter_desc, filter_grad_data, - kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - filter_perf_stat.data(), cudnn_workspace_ptr, - 
workspace_size_limit)); + auto cudnn_find_bd_f_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload:: + cudnnFindConvolutionBackwardFilterAlgorithmEx( + handle, cudnn_input_desc, input_data, + cudnn_output_grad_desc, output_grad_data, + cudnn_conv_desc, cudnn_filter_desc, + filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS, + &returned_algo_count, filter_perf_stat.data(), + cudnn_workspace, workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_bd_f_func, + workspace_size_limit); return filter_perf_stat[0].algo; }); VLOG(3) << "cuDNN backward filter algo " << filter_algo; @@ -499,16 +482,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } - // ------------------- cudnn conv workspace --------------------- - if (!cudnn_workspace_ptr) { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_in_bytes)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - } - // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { @@ -516,12 +489,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset input_grad. for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, - filter_data + i * group_offset_filter, cudnn_output_grad_desc, - output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, - cudnn_workspace_ptr, workspace_size_in_bytes, &beta, - cudnn_input_desc, input_grad_data + i * group_offset_in)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + data_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_input_desc, input_grad_data + i * group_offset_in)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } // ------------------- cudnn conv backward filter --------------------- @@ -529,12 +505,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. 
for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_output_grad_desc, output_grad_data + i * group_offset_out, - cudnn_conv_desc, filter_algo, cudnn_workspace_ptr, - workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + i * group_offset_filter)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, + input_data + i * group_offset_in, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + i * group_offset_filter)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 8f80a2d782..09f3d3de54 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -92,24 +92,26 @@ platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( const platform::Place& place, const cudaStream_t& stream) { PADDLE_ENFORCE(platform::is_gpu_place(place)); auto place_stream = std::make_pair(place, stream); - std::unique_lock lock(mtx_); - auto it = device_allocator_.find(place_stream); - if (it == device_allocator_.end()) { - auto tmp_allocator = new TemporaryAllocator(place); - tmp_allocator->SetCallback([stream]() { - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE(cudaGetLastError()); - }); - device_allocator_[place_stream].reset(tmp_allocator); - return *tmp_allocator; - } else { - return *it->second; + { + std::unique_lock lock(mtx_); + if (!device_allocator_.count(place_stream)) { + device_allocator_[place_stream].reset(new TemporaryAllocator(place)); + device_allocator_[place_stream]->SetCallback([stream]() { + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaGetLastError()); + }); + } } + return *device_allocator_.at(place_stream); } template <> platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( const platform::CUDADeviceContext& dev_ctx) { + auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream()); + if (device_allocator_.count(place_stream)) { + return *device_allocator_.at(place_stream); + } return Get(dev_ctx.GetPlace(), dev_ctx.stream()); } #endif @@ -323,7 +325,7 @@ Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { auto& allocator = DeviceTemporaryAllocator::Instance().Get(*this); - allocator.Release([this]() { + allocator.Release([=]() { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); }); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index d376f90ad5..c81d17380c 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -61,7 +61,7 @@ namespace platform { * the allocations of temp_allocation_queue: * - when the Stream calls cudaStreamSynchronize; * - when the allocation size of opportunities exceeds a certain threshold - * (defined by FLAGS_limit_of_tmp_allocation). + * (defined by FLAGS_limit_of_temporary_allocation). 
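As the restored comment above describes, the reverted allocator frees lazily: Free() only moves an allocation onto a pending queue, and the queue is drained once its total size crosses FLAGS_limit_of_temporary_allocation, after the registered callback has synchronized the stream. A condensed sketch of that policy (a model, not the real class):

#include <cstddef>
#include <cstdio>
#include <functional>

// Condensed model of the reverted policy: freed temporaries are queued, and
// the queue is only drained once its byte count crosses the limit flag.
struct TempAllocatorModel {
  long long limit = 1024;  // stands in for FLAGS_limit_of_temporary_allocation
  size_t wait_delete_mem = 0;
  std::function<void()> callback = [] {};  // e.g. cudaStreamSynchronize

  void Free(size_t size) {
    wait_delete_mem += size;  // enqueue only; nothing is released yet
    if (limit > 0 && wait_delete_mem > static_cast<size_t>(limit)) {
      Release();
    }
  }
  void Release() {
    callback();  // make sure the device is done with the memory first
    std::printf("draining %zu queued bytes\n", wait_delete_mem);
    wait_delete_mem = 0;
  }
};

int main() {
  TempAllocatorModel alloc;
  alloc.Free(600);
  alloc.Free(600);  // total 1200 > 1024, so this call triggers the drain
  return 0;
}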
* * */ class DeviceTemporaryAllocator { diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc index 9cbdfe46e7..0be017f75b 100644 --- a/paddle/fluid/platform/temporary_allocator.cc +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -15,15 +15,8 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -DEFINE_int64(limit_of_tmp_allocation, -1, - "The up limit of temporary_allocation size."); -DEFINE_double(times_excess_than_required_tmp_allocation, 2, - "times_excess_than_required_tmp_allocation indicates the " - "max size the TemporaryAllocator can return. For example, " - "if the required memory size is N, and " - "times_excess_than_required_tmp_allocation is 2.0, " - "the TemporaryAllocator will return the available allocation " - "that the range of size is N ~ 2*N."); +DEFINE_double(limit_of_temporary_allocation, -1, + "The up limit of temporary_allocation size."); namespace paddle { namespace platform { @@ -36,25 +29,24 @@ TemporaryAllocation::TemporaryAllocation( underlying_allocation_(std::move(underlying_allocation)) {} TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { - temp_mem_map_.reset(new std::multimap()); + temp_mem_queue_.reset(new std::deque()); } bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } void TemporaryAllocator::Release(const std::function &callback) { - std::unique_ptr> t_allocations; + std::shared_ptr> t_allocations; { std::unique_lock lock(mtx_); callback(); - t_allocations.swap(temp_mem_map_); - temp_mem_map_.reset(new std::multimap()); + t_allocations = temp_mem_queue_; + temp_mem_queue_.reset(new std::deque()); wait_delete_mem_ = 0; } - for (auto tmp : *t_allocations) { - VLOG(10) << "Delete temporary allocation " << tmp.second->ptr() - << " size: " << tmp.second->size(); - delete tmp.second; + VLOG(10) << "Delete temporary allocation " << tmp->ptr() + << " size: " << tmp->size(); + delete tmp; } } @@ -62,34 +54,28 @@ void TemporaryAllocator::Free(alloc::Allocation *allocation) { auto *temp_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(temp_allocation); if (platform::is_gpu_place(temp_allocation->place())) { - PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_), - "The place should be the same."); size_t wait_delete_mem = 0; { std::unique_lock lock(mtx_); - temp_mem_map_->emplace(temp_allocation->size(), temp_allocation); + temp_mem_queue_->emplace_back(temp_allocation); wait_delete_mem_ += temp_allocation->size(); wait_delete_mem = wait_delete_mem_; VLOG(10) << "Move temporary allocation: " << temp_allocation->ptr() << " to delete queue: " << temp_allocation->size() << "; " - << "wait_delete_mem: " << wait_delete_mem; + << "wait_delete_mem: " << wait_delete_mem_; } - - if (FLAGS_limit_of_tmp_allocation > 0 && - wait_delete_mem > static_cast(FLAGS_limit_of_tmp_allocation)) { - PADDLE_ENFORCE(callback_ != nullptr, "The callback is non-initialized."); + if (FLAGS_limit_of_temporary_allocation > 0 && + wait_delete_mem > FLAGS_limit_of_temporary_allocation) { Release(callback_); } return; } - VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr() - << " size: " << temp_allocation->size(); delete temp_allocation; } size_t TemporaryAllocator::TemporaryAllocationQueueSize() { std::unique_lock lock(mtx_); - return temp_mem_map_ ? temp_mem_map_->size() : 0; + return temp_mem_queue_ ? 
temp_mem_queue_->size() : 0; } void TemporaryAllocator::SetCallback(const std::function &callback) { @@ -98,27 +84,6 @@ void TemporaryAllocator::SetCallback(const std::function &callback) { alloc::Allocation *TemporaryAllocator::AllocateImpl( size_t size, alloc::Allocator::Attr attr) { - { - // Find available allocation in temp_mem_map. - std::unique_lock lock(mtx_); - if (temp_mem_map_->size()) { - auto it = temp_mem_map_->lower_bound(size); - // FIXME(zcd): Not sure the best value of excess fraction. - if (it != temp_mem_map_->end() && - it->first < - static_cast( - size * FLAGS_times_excess_than_required_tmp_allocation)) { - auto tmp_ptr = it->second; - temp_mem_map_->erase(it); - wait_delete_mem_ -= tmp_ptr->size(); - VLOG(10) << "Reuse temporary allocation: " << tmp_ptr->ptr() << ": " - << tmp_ptr->size(); - return tmp_ptr; - } - } - } - // If not find the the available allocation, get allocation from - // AllocatorFacadeInstance. auto raw_allocation = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); auto temp_mem = new TemporaryAllocation(std::move(raw_allocation)); diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index d657a14223..812c4a3331 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -15,7 +15,6 @@ #pragma once #include // NOLINT #include -#include #include // NOLINT #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/lock_guard_ptr.h" @@ -40,7 +39,7 @@ class TemporaryAllocation : public memory::allocation::Allocation { * * There is one opportunity to free the allocations of temp_allocation_queue: * - when the allocation size of opportunities exceeds a certain threshold - * (defined by FLAGS_limit_of_tmp_allocation). + * (defined by FLAGS_limit_of_temporary_allocation). * * */ class TemporaryAllocator : public memory::allocation::Allocator { @@ -63,10 +62,11 @@ class TemporaryAllocator : public memory::allocation::Allocator { private: platform::Place place_; + // When the allocation is not held by any variable, it should be placed - // to temp_mem_map immediately. - std::unique_ptr> temp_mem_map_{ - nullptr}; + // to temp_mem_queue immediately. 
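One detail worth noting in the restored Release() above: the pending queue is swapped out while the mutex is held, but the queued allocations are deleted only after the lock is dropped, so slow frees never serialize concurrent callers. A minimal sketch of that lock discipline:

#include <deque>
#include <memory>
#include <mutex>

// Minimal model of Release(): swap the queue under the lock, delete outside.
struct Allocation { char payload[64]; };

class TempQueue {
 public:
  void Push(Allocation* a) {
    std::lock_guard<std::mutex> lock(mtx_);
    queue_->push_back(a);
  }
  void Release() {
    std::shared_ptr<std::deque<Allocation*>> pending;
    {
      std::lock_guard<std::mutex> lock(mtx_);
      pending = queue_;                                      // take ownership
      queue_ = std::make_shared<std::deque<Allocation*>>();  // fresh queue
    }
    // The mutex is no longer held here, so other threads can keep queueing
    // while the old allocations are destroyed.
    for (Allocation* a : *pending) delete a;
  }

 private:
  std::mutex mtx_;
  std::shared_ptr<std::deque<Allocation*>> queue_ =
      std::make_shared<std::deque<Allocation*>>();
};

int main() {
  TempQueue q;
  q.Push(new Allocation);
  q.Release();
  return 0;
}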
+ std::shared_ptr> temp_mem_queue_{nullptr}; + std::mutex mtx_; size_t wait_delete_mem_{0}; std::function callback_; diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index 3879cd5400..35d1d92981 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -18,8 +18,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" -DECLARE_int64(limit_of_tmp_allocation); -DECLARE_double(times_excess_than_required_tmp_allocation); +DECLARE_double(limit_of_temporary_allocation); namespace paddle { namespace platform { @@ -36,7 +35,7 @@ class DummyOp : public framework::OperatorBase { const platform::Place& place) const override {} }; -TEST(temporary_allocator, test_base_function) { +TEST(temporary_allocator, temporary_allocator) { platform::CPUPlace cpu_place; TemporaryAllocator alloc(cpu_place); alloc.Allocate(100); @@ -60,10 +59,10 @@ TEST(temporary_allocator, test_base_function) { #endif } -TEST(temporary_allocator, test_flags_function) { +TEST(temporary_allocator, add_callback) { #ifdef PADDLE_WITH_CUDA - const int64_t limit = FLAGS_limit_of_tmp_allocation; - FLAGS_limit_of_tmp_allocation = 10; + const double limit = FLAGS_limit_of_temporary_allocation; + FLAGS_limit_of_temporary_allocation = 10; platform::CUDAPlace gpu_place(0); TemporaryAllocator gpu_alloc(gpu_place); @@ -79,52 +78,7 @@ TEST(temporary_allocator, test_flags_function) { }); { gpu_alloc.Allocate(100); } PADDLE_ENFORCE(deleted); - FLAGS_limit_of_tmp_allocation = limit; -#endif -} - -TEST(temporary_allocator, test_reuse_tmp_allocation) { -#ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - gpu_alloc.SetCallback([]() {}); - - void* tmp_allocation_ptr1 = nullptr; - { - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - auto tmp_allocation1 = gpu_alloc.Allocate(100); - tmp_allocation_ptr1 = tmp_allocation1->ptr(); - } - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - auto tmp_allocation2 = gpu_alloc.Allocate(100); - void* tmp_allocation_ptr2 = tmp_allocation2->ptr(); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2); - - auto tmp_allocation3 = gpu_alloc.Allocate(100); - void* tmp_allocation_ptr3 = tmp_allocation2->ptr(); - PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr3); -#endif -} - -TEST(temporary_allocator, test_times_excess_than_required_tmp_allocation) { -#ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - gpu_alloc.SetCallback([]() {}); - double excess_fraction = FLAGS_times_excess_than_required_tmp_allocation; - void* tmp_allocation_ptr1 = nullptr; - { - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - auto tmp_allocation1 = - gpu_alloc.Allocate(static_cast(100 * excess_fraction - 1)); - tmp_allocation_ptr1 = tmp_allocation1->ptr(); - } - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - auto tmp_allocation2 = gpu_alloc.Allocate(100); - void* tmp_allocation_ptr2 = tmp_allocation2->ptr(); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2); + FLAGS_limit_of_temporary_allocation = limit; #endif } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 686550a3c8..2c17716500 100644 --- 
a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -155,8 +155,7 @@ def __bootstrap__(): 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', - 'sync_nccl_allreduce', 'limit_of_tmp_allocation', - 'times_excess_than_required_tmp_allocation' + 'sync_nccl_allreduce' ] core.init_gflags([sys.argv[0]] + From c4eced9881bd6a5c4f47739f4a83c422620f11ac Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 11 Jan 2019 12:52:03 +0800 Subject: [PATCH 21/22] fix thread safe bug test=develop --- paddle/fluid/platform/device_context.cc | 28 ++++++++++++------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 09f3d3de54..8f80a2d782 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -92,26 +92,24 @@ platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( const platform::Place& place, const cudaStream_t& stream) { PADDLE_ENFORCE(platform::is_gpu_place(place)); auto place_stream = std::make_pair(place, stream); - { - std::unique_lock lock(mtx_); - if (!device_allocator_.count(place_stream)) { - device_allocator_[place_stream].reset(new TemporaryAllocator(place)); - device_allocator_[place_stream]->SetCallback([stream]() { - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE(cudaGetLastError()); - }); - } + std::unique_lock lock(mtx_); + auto it = device_allocator_.find(place_stream); + if (it == device_allocator_.end()) { + auto tmp_allocator = new TemporaryAllocator(place); + tmp_allocator->SetCallback([stream]() { + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaGetLastError()); + }); + device_allocator_[place_stream].reset(tmp_allocator); + return *tmp_allocator; + } else { + return *it->second; } - return *device_allocator_.at(place_stream); } template <> platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( const platform::CUDADeviceContext& dev_ctx) { - auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream()); - if (device_allocator_.count(place_stream)) { - return *device_allocator_.at(place_stream); - } return Get(dev_ctx.GetPlace(), dev_ctx.stream()); } #endif @@ -325,7 +323,7 @@ Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { auto& allocator = DeviceTemporaryAllocator::Instance().Get(*this); - allocator.Release([=]() { + allocator.Release([this]() { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); }); From 98e85f373578073657ef7c5a3efc4205e5353fce Mon Sep 17 00:00:00 2001 From: Zhaolong Xing Date: Fri, 11 Jan 2019 13:00:35 +0800 Subject: [PATCH 22/22] add_transpose_flatten_concat_fuse (#15121) --- paddle/fluid/framework/ir/CMakeLists.txt | 11 ++ .../framework/ir/graph_pattern_detector.cc | 63 ++++++++ .../framework/ir/graph_pattern_detector.h | 15 ++ .../ir/transpose_flatten_concat_fuse_pass.cc | 148 ++++++++++++++++++ .../ir/transpose_flatten_concat_fuse_pass.h | 38 +++++ paddle/fluid/inference/api/analysis_config.cc | 1 + .../fluid/inference/api/paddle_pass_builder.h | 4 + .../tensorrt/convert/elementwise_op.cc | 37 ++++- 8 files changed, 311 insertions(+), 6 deletions(-) create mode 100644 paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h diff 
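The "fix thread safe bug" patch above is about lock discipline in DeviceTemporaryAllocator::Get: the previous version consulted device_allocator_.count(place_stream) on a fast path without holding mtx_, which races with a concurrent insertion into the map; the fix performs the whole find-or-create under a single critical section. A distilled sketch of the corrected shape (types simplified):

#include <map>
#include <memory>
#include <mutex>
#include <utility>

// Distilled version of the fixed Get(): lookup and insertion share one
// critical section, so no thread ever reads the map unlocked.
class AllocatorRegistry {
 public:
  int& Get(const std::pair<int, int>& place_stream) {
    std::unique_lock<std::mutex> lock(mtx_);
    auto it = table_.find(place_stream);
    if (it == table_.end()) {
      auto* created = new int(0);
      table_[place_stream].reset(created);
      return *created;  // created and published under the same lock
    }
    return *it->second;
  }

 private:
  std::mutex mtx_;
  std::map<std::pair<int, int>, std::unique_ptr<int>> table_;
};

int main() {
  AllocatorRegistry registry;
  registry.Get({0, 1}) = 42;
  return registry.Get({0, 1}) == 42 ? 0 : 1;
}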
--git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a595a8ab42..42fb6a1aa5 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -48,6 +48,17 @@ pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_affine_channel_fuse_pass inference) +pass_library(transpose_flatten_concat_fuse_pass inference) + +# There may be many transpose-flatten structures in a model, and the output of +# these structures will be used as inputs to the concat Op. This pattern will +# be detected by our pass. The index here represents the number of structures in the +# pattern. We use index 3 ~ 6, because these quantities of structures are +# common in the models. +foreach (index RANGE 3 6) + file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n") +endforeach() + if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index c513fe2dd8..6282ced1e4 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1306,6 +1306,69 @@ PDNode *patterns::ConvAffineChannel::operator()( return ac_out_var; } +// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a +// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b +// ... +// z -> transpose_op(n) -> transpose_out_z -> flatten_op(n) -> flatten_out_z +// flatten_out_a -> concat_op flatten_out_b -> concat_op ... flatten_out_z -> +// concat_op +PDNode *patterns::TransposeFlattenConcat::operator()( + std::vector conv_in, int times) { + // The times represents the repeat times of the + // {trans, trans_out, flatten, flatten_out} + const int kNumFields = 4; + const int kTransOutOffset = 1; + const int kFlattenOffset = 2; + const int kFlattenOutOffset = 3; + + std::vector nodes; + + for (int i = 0; i < times; i++) { + nodes.push_back( + pattern->NewNode(GetNodeName("transpose" + std::to_string(i))) + ->assert_is_op("transpose2")); + nodes.push_back( + pattern->NewNode(GetNodeName("transpose_out" + std::to_string(i))) + ->assert_is_op_output("transpose2") + ->assert_is_op_input("flatten2", "X") + ->AsIntermediate()); + nodes.push_back(pattern->NewNode(GetNodeName("flatten" + std::to_string(i))) + ->assert_is_op("flatten2")); + + nodes.push_back( + pattern->NewNode(GetNodeName("flatten_out" + std::to_string(i))) + ->assert_is_op_output("flatten2") + ->assert_is_op_nth_input("concat", "X", i) + ->AsIntermediate()); + } + + auto concat_op = pattern->NewNode(GetNodeName("concat")) + ->assert_is_op("concat") + ->assert_op_has_n_inputs("concat", times); + auto concat_out = pattern->NewNode(GetNodeName("concat_out")) + ->assert_is_op_output("concat") + ->AsOutput(); + + std::vector flatten_outs; + for (int i = 0; i < times; i++) { + conv_in[i]->AsInput(); + // trans + nodes[i * kNumFields]->LinksFrom({conv_in[i]}); + // trans_out + nodes[i * kNumFields + kTransOutOffset]->LinksFrom({nodes[i * kNumFields]}); + // flatten + nodes[i * kNumFields + kFlattenOffset]->LinksFrom( + {nodes[i * kNumFields + kTransOutOffset]}); + // flatten_out + nodes[i * kNumFields + kFlattenOutOffset]->LinksFrom( + {nodes[i * kNumFields + kFlattenOffset]}); + flatten_outs.push_back(nodes[i * 
kNumFields + kFlattenOutOffset]); + } + + concat_op->LinksFrom(flatten_outs).LinksTo({concat_out}); + return concat_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 61a5300344..c8be586f54 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -766,6 +766,21 @@ struct ConvAffineChannel : public PatternBase { PATTERN_DECL_NODE(ac_out); // Out }; +struct TransposeFlattenConcat : public PatternBase { + TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "transpose_flatten_concat") {} + + PDNode* operator()(std::vector conv_inputs, int times); + + std::string GetNodeName(const std::string& op_type) { + return PDNodeName(name_scope_, repr_, id_, op_type); + } + + PDNode* GetPDNode(const std::string& op_type) { + return pattern->RetrieveNode(GetNodeName(op_type)); + } +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc new file mode 100644 index 0000000000..fda43948d5 --- /dev/null +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
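The pattern builder above flattens all repeated transpose/flatten branches into one node vector and addresses each branch's members with fixed offsets, i * kNumFields + offset. A toy sketch of that indexing scheme, using strings in place of graph nodes:

#include <cassert>
#include <string>
#include <vector>

// Each of the `times` branches contributes a fixed-size group of entries to
// one flat vector; member k of branch i lives at i * kNumFields + k.
int main() {
  const int kNumFields = 4;  // trans, trans_out, flatten, flatten_out
  const int kTransOutOffset = 1;
  const int kFlattenOutOffset = 3;
  const int times = 3;

  std::vector<std::string> nodes;
  for (int i = 0; i < times; ++i) {
    nodes.push_back("transpose" + std::to_string(i));
    nodes.push_back("transpose_out" + std::to_string(i));
    nodes.push_back("flatten" + std::to_string(i));
    nodes.push_back("flatten_out" + std::to_string(i));
  }
  // Branch 2's flatten output lives at 2 * kNumFields + kFlattenOutOffset.
  assert(nodes[2 * kNumFields + kFlattenOutOffset] == "flatten_out2");
  assert(nodes[1 * kNumFields + kTransOutOffset] == "transpose_out1");
  return 0;
}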
+ +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +template +std::unique_ptr TransposeFlattenConcatFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = + "transpose_flatten" + std::to_string(times) + "_concat_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + std::vector input_nodes; + for (int i = 0; i < times; i++) { + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(i)) + ->assert_is_op_input("transpose2", "X") + ->AsInput()); + } + + patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(), pattern_name); + pattern(input_nodes, times); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + const int kNumFields = 5; + const int kTransOffset = 1; + const int kTransOutOffset = 2; + const int kFlattenOffset = 3; + const int kFlattenOutOffset = 4; + std::vector nodes; + + for (int i = 0; i < times; i++) { + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i)))); + PADDLE_ENFORCE(subgraph.at(input_nodes[i])); + + nodes.push_back(subgraph.at(input_nodes[i])); + nodes.push_back( + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i)))); + } + + Node *concat_op = subgraph.at(pattern.GetPDNode("concat")); + Node *concat_out = subgraph.at(pattern.GetPDNode("concat_out")); + std::vector input_names; + std::vector trans_axis = boost::get>( + nodes[kTransOffset]->Op()->GetAttr("axis")); + int flatten_axis = + boost::get(nodes[kFlattenOffset]->Op()->GetAttr("axis")); + int concat_axis = boost::get(concat_op->Op()->GetAttr("axis")); + std::string output_name = concat_out->Name(); + + for (int i = 0; i < times; i++) { + input_names.push_back(nodes[i * kNumFields]->Name()); + } + + framework::OpDesc new_op_desc; + new_op_desc.SetType("fusion_transpose_flatten_concat"); + new_op_desc.SetInput("X", input_names); + new_op_desc.SetAttr("trans_axis", trans_axis); + new_op_desc.SetAttr("flatten_axis", flatten_axis); + new_op_desc.SetAttr("concat_axis", concat_axis); + new_op_desc.SetOutput("Out", {output_name}); + new_op_desc.Flush(); + + // Create a new node for the fused op. 
+ auto *new_conv_op = graph->CreateOpNode(&new_op_desc); + + std::unordered_set delete_nodes; + + for (int i = 0; i < times; i++) { + nodes[i * kNumFields]->outputs.push_back(new_conv_op); + new_conv_op->inputs.push_back(nodes[i * kNumFields]); + delete_nodes.insert(nodes[i * kNumFields + kTransOffset]); + delete_nodes.insert(nodes[i * kNumFields + kTransOutOffset]); + delete_nodes.insert(nodes[i * kNumFields + kFlattenOffset]); + delete_nodes.insert(nodes[i * kNumFields + kFlattenOutOffset]); + } + delete_nodes.insert(concat_op); + + new_conv_op->outputs.push_back(concat_out); + concat_out->inputs.push_back(new_conv_op); + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), delete_nodes); + }; + + gpd(graph.get(), handler); + return graph; +} + +template class TransposeFlattenConcatFusePass<1>; +template class TransposeFlattenConcatFusePass<3>; +template class TransposeFlattenConcatFusePass<4>; +template class TransposeFlattenConcatFusePass<5>; +template class TransposeFlattenConcatFusePass<6>; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(transpose_flatten_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<1>); + +REGISTER_PASS(transpose_flatten3_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<3>); + +REGISTER_PASS(transpose_flatten4_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<4>); + +REGISTER_PASS(transpose_flatten5_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<5>); + +REGISTER_PASS(transpose_flatten6_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<6>); diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h new file mode 100644 index 0000000000..fb0f0ae9ef --- /dev/null +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// There may be many transpose-flatten structures in a model, and the output of +// these structures will be used as inputs to the concat Op. This pattern will +// be detected by our pass. The times here represents the repeat times of this +// structure. 
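Because the repeat count is a compile-time template parameter, the pass above is explicitly instantiated once per supported count and each instantiation is registered under its own name (transpose_flatten3_concat_fuse_pass and so on). A toy sketch of that template-on-an-int pattern:

#include <iostream>

// One class template parameterized by the repeat count, explicitly
// instantiated for the counts that are common in real models, then
// registered by name (registration macro omitted here).
template <int kTimes>
class FusePassSketch {
 public:
  void Apply() const {
    std::cout << "fusing " << kTimes << " transpose+flatten branches\n";
  }
};

// Mirrors `template class TransposeFlattenConcatFusePass<3>;` etc.
template class FusePassSketch<3>;
template class FusePassSketch<6>;

int main() {
  FusePassSketch<3>().Apply();
  FusePassSketch<6>().Apply();
  return 0;
}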
+template +class TransposeFlattenConcatFusePass : public FusePassBase { + public: + virtual ~TransposeFlattenConcatFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 211c691504..336ab426c2 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; + Update(); } void contrib::AnalysisConfig::Update() { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 1e5712e163..de9650735a 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -141,6 +141,10 @@ class GpuPassStrategy : public PassStrategy { "conv_elementwise_add_fuse_pass", // }); + for (int i = 6; i >= 3; i--) { + passes_.push_back("transpose_flatten" + std::to_string(i) + + "_concat_fuse_pass"); + } use_gpu_ = true; } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 6975086193..79362f9677 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -39,6 +39,7 @@ class ElementwiseWeightOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
+ nvinfer1::ILayer* layer = nullptr; framework::OpDesc op_desc(op, nullptr); VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; @@ -98,13 +99,21 @@ class ElementwiseWeightOpConverter : public OpConverter { 0}; TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; + if (op_type_ == "add") { + nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *X, scale_mode, shift_weights.get(), + scale_weights.get(), power_weights.get()); + layer = scale_layer; + } else if (op_type_ == "mul") { + nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *X, scale_mode, scale_weights.get(), + shift_weights.get(), power_weights.get()); + layer = scale_layer; + } - nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER( - engine_, Scale, *const_cast(X), scale_mode, - shift_weights.get(), scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Out")[0]; - - layer->setName(("elementwise_add (Output: " + output_name + ")").c_str()); + layer->setName( + ("elementwise_" + op_type_ + "(Output: " + output_name + ")").c_str()); layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); engine_->SetITensor(output_name, layer->getOutput(0)); @@ -113,6 +122,9 @@ class ElementwiseWeightOpConverter : public OpConverter { engine_->DeclareOutput(output_name); } } + + protected: + std::string op_type_; }; class ElementwiseTensorOpConverter : public OpConverter { @@ -188,6 +200,16 @@ const std::unordered_map {"max", nvinfer1::ElementWiseOperation::kMAX}, }; +class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter { + public: + ElementwiseWeightAddOpConverter() { op_type_ = "add"; } +}; + +class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter { + public: + ElementwiseWeightMulOpConverter() { op_type_ = "mul"; } +}; + class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter { public: ElementwiseTensorAddOpConverter() { op_type_ = "add"; } @@ -227,7 +249,10 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter { } // namespace inference } // namespace paddle -REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, + ElementwiseWeightAddOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight, + ElementwiseWeightMulOpConverter); REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, ElementwiseTensorAddOpConverter);
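The converter change above splits the weighted elementwise conversion by op type because TensorRT's IScaleLayer computes out = (in * scale + shift) ^ power: "add" feeds the weight tensor into the shift slot (leaving scale at its identity default), while the newly added "mul" variant feeds it into the scale slot instead. A scalar model of the layer semantics (per-element view only; the real scale modes broadcast per channel or per tensor):

#include <cstdio>

// Scalar model of IScaleLayer: out = (in * scale + shift) ^ power.
// elementwise_add places the weight in the shift slot (scale = 1);
// elementwise_mul places it in the scale slot (shift = 0); power stays 1.
static float ScaleLayer(float in, float scale, float shift) {
  return in * scale + shift;  // power == 1, so the pow() is omitted
}

int main() {
  const float w = 0.5f;  // stands in for the broadcast weight value
  const float x = 2.0f;
  std::printf("elementwise_add -> %.2f\n", ScaleLayer(x, 1.0f, w));  // 2.50
  std::printf("elementwise_mul -> %.2f\n", ScaleLayer(x, w, 0.0f));  // 1.00
  return 0;
}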