From 4f6e9e3ac30ed4b3489394b79f0b1dd607432b93 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 19 Dec 2018 10:48:55 +0800 Subject: [PATCH 01/22] teacher student sigmoid loss --- .../teacher_student_sigmoid_loss_op.cc | 256 ++++++++++++++++++ .../teacher_student_sigmoid_loss_op.h | 25 ++ python/paddle/fluid/layers/nn.py | 42 +++ .../test_teacher_student_sigmoid_loss_op.py | 70 +++++ 4 files changed, 393 insertions(+) create mode 100644 paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc create mode 100644 paddle/fluid/operators/teacher_student_sigmoid_loss_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc new file mode 100644 index 0000000000..98eafb9f84 --- /dev/null +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, + "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(label_dims[1], 1UL, + "The 2nd dimension of " + "Input(Label) should be 1."); + ctx->SetOutputDim("Y", {x_dims[0], 1}); + ctx->ShareLoD("X", /*->*/ "Y"); + } + + protected: + // Explicitly set that the data type of computation kernel of + // teacher_student_sigmoid_loss + // is determined by its input "X". 
+ framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class TeacherStudentSigmoidLossGradientOp + : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2."); + PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0], + "The 1st dimension of Input(X) and Input(Y@Grad) should " + "be equal."); + PADDLE_ENFORCE_EQ(dy_dims[1], 1, + "The 2nd dimension of Input(Y@Grad) should be 1."); + PADDLE_ENFORCE_EQ(label_dims[1], 1, + "When Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + // Explicitly set that the data type of computation kernel of + // teacher_student_sigmoid_loss + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class TeacherStudentSigmoidLossOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape [N x 1]," + " where N is the batch size and D is the output. " + "This input is a probability computed by the previous operator, " + "which is almost always the result of a softmax operator."); + AddInput("Label", + "(Tensor), the ground truth which is a 2-D tensor. " + "Label is a Tensor with shape [N x 1]. "); + AddOutput("Y", + "(Tensor, default Tensor), a 2-D tensor with shape " + "[N x 1]. The teacher student sigmoid loss."); + AddAttr("soft_max_up_bound", "fp32, default 15.0").SetDefault(15.0); + AddAttr("soft_max_lower_bound", "fp32, default -15.0") + .SetDefault(-15.0); + AddComment(R"DOC( +TeacherStudentSigmoidLoss Operator. +TeacherStudentSigmoidLoss Operator. + +It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is that +we add another label(z') to original. 
+ loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) + z is click or not + z' is value q of feed_fine + label = {-2, -1, [0, 2]} + when z' is not exist, clk = 0 : label = -2; + when z' is not exist, clk = 1 : label = -1; + when z' is exist , clk = 0 : label = 0 + z'; + when z' is exist , clk = 1 : label = 1 + z'; + +)DOC"); + } +}; + +// template +template +class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()), + "This kernel only runs on CPU."); + + Tensor* y = context.Output("Y"); + const Tensor* x = context.Input("X"); + const Tensor* labels = context.Input("Label"); + T* y_data = y->mutable_data(context.GetPlace()); + const T* x_data = x->data(); + const T* label_data = labels->data(); + int64_t batch_size = x->dims()[0]; + // loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + + // log(1 + exp(-abs(x))) + // z is click or not + // z' is value q of feed_fine + // label = {-2, -1, [0, 2]} + // when z' is not exist, clk = 0 : label = -2; + // when z' is not exist, clk = 1 : label = -1; + // when z' is exist , clk = 0 : label = 0 + z'; + // when z' is exist , clk = 1 : label = 1 + z'; + for (int i = 0; i < batch_size; ++i) { + if (label_data[i] < -1.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + + log(1.0 + exp(-fabs(x_data[i]))); + } else if (label_data[i] < 0.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + + log(1.0 + exp(-fabs(x_data[i]))); + } else if (label_data[i] < 1.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + + log(1.0 + exp(-fabs(x_data[i]))) + + (x_data[i] > 0 ? x_data[i] : 0.0) - + x_data[i] * label_data[i] + + log(1.0 + exp(-fabs(x_data[i]))); + } else { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + + log(1.0 + exp(-fabs(x_data[i]))) + + (x_data[i] > 0 ? 
x_data[i] : 0.0) - + x_data[i] * (label_data[i] - 1.0) + + log(1.0 + exp(-fabs(x_data[i]))); + } + } + } +}; + +template +class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + const T* x_data = x->data(); + + Tensor* dx = context.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(context.GetPlace()); + + const Tensor* labels = context.Input("Label"); + const T* label_data = labels->data(); + + T soft_max_up_bound = + static_cast(context.Attr("soft_max_up_bound")); + T soft_max_lower_bound = + static_cast(context.Attr("soft_max_lower_bound")); + + int64_t batch_size = x->dims()[0]; + + const framework::Tensor* dOut = + context.Input(framework::GradVarName("Y")); + + const T* dout_data = dOut->data(); + + for (int i = 0; i < batch_size; ++i) { + T sum_val = x_data[i]; + if (sum_val > soft_max_up_bound) { + sum_val = soft_max_up_bound; + } else { + if (sum_val < soft_max_lower_bound) { + sum_val = soft_max_lower_bound; + } + } + + T pred = 1.0 / (1.0 + exp(-sum_val)); + if (label_data[i] < -1.0) { + dx_data[i] = 0.0 - pred; + } else if (label_data[i] < 0.0) { + dx_data[i] = 1.0 - pred; + } else { + dx_data[i] = label_data[i] - 2.0 * pred; + } + if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) { + dx_data[i] = 0; + } + dx_data[i] *= dout_data[i] * -1; + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(teacher_student_sigmoid_loss, + ops::TeacherStudentSigmoidLossOp, + ops::TeacherStudentSigmoidLossOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad, + ops::TeacherStudentSigmoidLossGradientOp); + +REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss, + ops::TeacherStudentSigmoidLossOpKernel, + ops::TeacherStudentSigmoidLossOpKernel); + +REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss_grad, + ops::TeacherStudentSigmoidLossGradOpKernel, + ops::TeacherStudentSigmoidLossGradOpKernel); diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h new file mode 100644 index 0000000000..77b2760e9c --- /dev/null +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9e6cd1a0ab..68243cf744 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -176,6 +176,7 @@ __all__ = [ 'get_tensor_from_selected_rows', 'lstm', 'psroi_pool', + 'teacher_student_sigmoid_loss', ] kIgnoreIndex = -100 @@ -9184,6 +9185,47 @@ def log_loss(input, label, epsilon=1e-4, name=None): return loss +def teacher_student_sigmoid_loss(input, + label, + soft_max_up_bound=15.0, + soft_max_lower_bound=-15.0): + """ + **Teacher Student Log Loss Layer** + + This layer accepts input predictions and target label and returns the + teacher_student loss. + + .. math:: + loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) + + Args: + input (Variable|list): a 2-D tensor with shape [N x 1], where N is the + batch size. This input is a probability computed + by the previous operator. + label (Variable|list): the ground truth which is a 2-D tensor with + shape [N x 1], where N is the batch size. + soft_max_up_bound (float): if input > soft_max_up_bound, will be bound + soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound + + Returns: + Variable: A 2-D tensor with shape [N x 1], the teacher_student_sigmoid_loss. + + Examples: + .. code-block:: python + cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label) + """ + helper = LayerHelper('teacher_student_sigmoid_loss', **locals()) + out = helper.create_variable(dtype=input.dtype) + helper.append_op( + type='teacher_student_sigmoid_loss', + inputs={'X': [input], + 'Label': [label]}, + outputs={'Y': [out]}, + attrs={"soft_max_lower_bound": float(soft_max_lower_bound), \ + "soft_max_up_bound": float(soft_max_up_bound)}) + return out + + def add_position_encoding(input, alpha, beta, name=None): """ **Add Position Encoding Layer** diff --git a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py new file mode 100644 index 0000000000..faa5163b32 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from math import log +from math import exp +from op_test import OpTest +from scipy.special import logit +from scipy.special import expit +import unittest + + +class TestTeacherStudentSigmoidLossOp(OpTest): + """ + Test teacher_student_sigmoid_loss with discrete one-hot labels. 
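
    For reference, the label encoding exercised by this test maps a
    (click, teacher score q) pair onto the single float label column.
    An illustrative helper (the name encode_label is hypothetical, not
    part of the patch):

        def encode_label(clk, q=None):
            # encoding from the operator comment: q (the teacher value z')
            # is assumed to lie in [0, 1) so the kernel's thresholds on the
            # label can separate the four cases
            if q is None:                      # no teacher signal
                return -2.0 if clk == 0 else -1.0
            return float(clk) + q              # [0, 1) for clk=0, [1, 2) for clk=1
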
+ """ + + def setUp(self): + """ + ut + """ + self.op_type = "teacher_student_sigmoid_loss" + batch_size = 16 + num_classes = 1 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, (batch_size, num_classes)) + .astype("float32")), + 'Label': np.random.uniform(0, 2, (batch_size, num_classes)) + .astype("float32") + } + outs = [] + for index, label in enumerate(self.inputs["Label"]): + x = self.inputs["X"][index] + if label < -1.0: + outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x)))) + elif label < 0.0: + outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x)))) + elif label < 1.0: + outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x))) + \ + max(x, 0.0) - x * label + log(1.0 + exp(-abs(x)))) + #print "33 python x:", x, "python label:", label, "term1:", max(x, 0.0) + log(1.0 + exp(-abs(x))), "term2:", max(x, 0.0) - x * label + log(1.0 + exp(-abs(x))) + else: + outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x))) + \ + max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x)))) + #print "44 python x:", x, "python label:", label, "term1:", max(x, 0.0) - x + log(1.0 + exp(-abs(x))), "term2:", max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x))) + self.outputs = {'Y': np.array(outs)} + + def test_check_output(self): + """ + ut + """ + self.check_output() + + def test_check_grad(self): + """ + ut + """ + self.check_grad(["X"], "Y", numeric_grad_delta=0.005) From b3cf476de414a3707a00f906422ddb956db7798f Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 19 Dec 2018 11:00:50 +0800 Subject: [PATCH 02/22] teacher student sigmoid loss test=develop --- python/paddle/fluid/layers/nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 68243cf744..96fa503e77 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9204,7 +9204,7 @@ def teacher_student_sigmoid_loss(input, by the previous operator. label (Variable|list): the ground truth which is a 2-D tensor with shape [N x 1], where N is the batch size. 
- soft_max_up_bound (float): if input > soft_max_up_bound, will be bound + soft_max_up_bound (float): if input > soft_max_up_bound, will be bound soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound Returns: From a94285869b42afa9581bff565ccfdae18c6cc7dd Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 19 Dec 2018 12:48:04 +0800 Subject: [PATCH 03/22] add API test=develop --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc | 2 +- paddle/fluid/operators/teacher_student_sigmoid_loss_op.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 5e9901bb87..460ada8a00 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -74,6 +74,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) +paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)) paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index 98eafb9f84..4b307140c5 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h index 77b2760e9c..f8e64c4d18 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
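
With the operator and its Python API registered (patches 01 through 04), the
computation is worth restating outside the framework. The NumPy sketch below
mirrors the forward and gradient kernels from patch 01; it is a reference
sketch under stated assumptions (float arrays of shape [N, 1], labels encoded
as in the operator comment), not the framework's actual code path.

import numpy as np

def sce(x, z):
    # numerically stable sigmoid cross entropy:
    # max(x, 0) - x * z + log(1 + exp(-|x|))
    return np.maximum(x, 0.0) - x * z + np.log(1.0 + np.exp(-np.abs(x)))

def teacher_student_loss(x, label):
    # the click target z and teacher target z' are recovered from the label
    # encoding; labels below 0 carry no teacher term
    return np.where(label < -1.0, sce(x, 0.0),
           np.where(label < 0.0, sce(x, 1.0),
           np.where(label < 1.0, sce(x, 0.0) + sce(x, label),
                    sce(x, 1.0) + sce(x, label - 1.0))))

def teacher_student_loss_grad(x, label, dout, ub=15.0, lb=-15.0):
    pred = 1.0 / (1.0 + np.exp(-np.clip(x, lb, ub)))
    # d(sce)/dx = sigmoid(x) - z; for labels >= 0 the click and teacher
    # targets sum to `label`, so the combined derivative is 2*pred - label
    g = np.where(label < -1.0, pred,
        np.where(label < 0.0, pred - 1.0, 2.0 * pred - label))
    g = np.where((x >= ub) | (x <= lb), 0.0, g)  # zero gradient once saturated
    return g * dout

Note that soft_max_up_bound and soft_max_lower_bound only affect the backward
pass: the forward kernel works on the raw logits, while the gradient kernel
clips them and returns a zero gradient outside the bounds.
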
From 754a5f8866da4fc3c324211a1ae7cb1746d5648a Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 19 Dec 2018 14:27:25 +0800 Subject: [PATCH 04/22] refine API test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 460ada8a00..33cf1df2a0 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -74,7 +74,6 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) -paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)) paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) @@ -210,6 +209,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)) paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) From 39f4e9273ee07fefcbfab47d92d322dd3d654578 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 19 Dec 2018 19:41:04 +0800 Subject: [PATCH 05/22] data_norm test=develop --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/data_norm_op.cc | 409 +++++++++++++++++++++++++ paddle/fluid/operators/data_norm_op.h | 35 +++ python/paddle/fluid/layers/nn.py | 134 +++++++- 4 files changed, 576 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/data_norm_op.cc create mode 100644 paddle/fluid/operators/data_norm_op.h diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 33cf1df2a0..56b129a928 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -88,6 +88,7 @@ 
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'poo paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) +paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, False, None, None, None, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc new file mode 100644 index 0000000000..d5bc25d19c --- /dev/null +++ b/paddle/fluid/operators/data_norm_op.cc @@ -0,0 +1,409 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/data_norm_op.h" +#include +#include "paddle/fluid/framework/data_layout.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +class DataNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSize"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSum"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Means"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Scales"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); + + const int64_t C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C); + + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("Means", {C}); + ctx->SetOutputDim("Scales", {C}); + ctx->ShareLoD("X", "Y"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = ctx.Input("X")->type(); + // By default, the type of the scale, bias, mean, + // and var tensors should both be float. (For float or float16 input tensor) + // or double (For double input tensor). 
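
For orientation before the kernels that follow: data_norm keeps three
accumulated statistics per channel (a count, a sum, and a squared sum) and
normalizes each channel with a mean and scale derived from them. A minimal
NumPy sketch of the forward pass, assuming a 2-D float input of shape [N, C]
(an illustration, not the Eigen implementation below):

import numpy as np

def data_norm_forward(x, batch_size, batch_sum, batch_square_sum):
    # each statistic is a 1-D array of length C
    means = batch_sum / batch_size
    scales = np.sqrt(batch_size / batch_square_sum)
    return (x - means) * scales, means, scales

With the defaults the Python layer sets further below (batch_size and
batch_square_sum initialized to 1e4, batch_sum to 0.0), means start at 0 and
scales at 1, so the layer initially acts as an identity transform; in the
backward pass d_x is simply d_y * scales.
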
+ auto dn_param_type = framework::proto::VarType::FP32; + if (input_data_type == framework::proto::VarType::FP64) { + dn_param_type = framework::proto::VarType::FP64; + } + PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input("BatchSize")->type(), + "BatchSize input should be of float type"); + PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input("BatchSum")->type(), + "BatchSum input should be of float type"); + PADDLE_ENFORCE_EQ(dn_param_type, + ctx.Input("BatchSquareSum")->type(), + "BatchSquareSum input should be of float type"); + + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); + } +}; + +class DataNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + // AddAttr("is_test", "").SetDefault(false); + AddAttr("epsilon", "") + .SetDefault(1e-4) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("data_layout", "").SetDefault("NCHW"); + AddInput("X", "The input tensor"); + AddInput("BatchSize", + "BatchSize is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("BatchSum", + "BatchSum is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("BatchSquareSum", + "The global BatchSquareSum (for training) or " + "estimated BatchSquareSum (for testing)"); + AddOutput("Y", "result after normalization"); + AddOutput("Means", + "Mean of the history data batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("Scales", + "Scales of the history data batch, " + "will apply to output when training") + .AsIntermediate(); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddComment(R"DOC( +Data Normalization. + +Can be used as a normalizer function for data +The required data format for this layer is one of the following: +1. NHWC `[batch, in_height, in_width, in_channels]` +2. NCHW `[batch, in_channels, in_height, in_width]` + +)DOC"); + } +}; + +template +class DataNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + // const bool is_test = ctx.Attr("is_test"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("Means"); + auto *scales = ctx.Output("Scales"); + + // alloc memory + y->mutable_data(ctx.GetPlace()); + + Eigen::Array inv_std(C); + ConstEigenVectorArrayMap b_size_arr( + ctx.Input("BatchSize")->data(), C); + ConstEigenVectorArrayMap b_sum_arr( + ctx.Input("BatchSum")->data(), C); + ConstEigenVectorArrayMap b_square_sum_arr( + ctx.Input("BatchSquareSum")->data(), C); + EigenVectorArrayMap means_arr(mean_out->mutable_data(ctx.GetPlace()), + C); + EigenVectorArrayMap scales_arr(scales->mutable_data(ctx.GetPlace()), + C); + means_arr = b_sum_arr / b_size_arr; + scales_arr = (b_size_arr / b_square_sum_arr).sqrt(); + + switch (data_layout) { + case DataLayout::kNCHW: // because it's two dimensions, so make no + // difference + case DataLayout::kNHWC: { + EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, N) = + (ConstEigenArrayMap(x->data(), C, N).colwise() - means_arr) + .colwise() * + scales_arr; + break; + } + default: + PADDLE_THROW("Unknown storage order: %d", data_layout); + } + } +}; + +class DataNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSize"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSum"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), ""); + PADDLE_ENFORCE(ctx->HasInput("Means"), ""); + PADDLE_ENFORCE(ctx->HasInput("Scales"), ""); + + // check output + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSize")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSum")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSquareSum")), + ""); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->SetOutputDim(framework::GradVarName("BatchSize"), {C}); + ctx->SetOutputDim(framework::GradVarName("BatchSum"), {C}); + ctx->SetOutputDim(framework::GradVarName("BatchSquareSum"), {C}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), layout, library); + } +}; + +template +class DataNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *batch_size = ctx.Input("BatchSize"); + const auto *batch_sum = ctx.Input("BatchSum"); + const auto *batch_square_sum = ctx.Input("BatchSquareSum"); + const auto *scales = ctx.Input("Scales"); + const auto *means = ctx.Input("Means"); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_batch_size = + ctx.Output(framework::GradVarName("BatchSize")); + auto *d_batch_sum = ctx.Output(framework::GradVarName("BatchSum")); + auto *d_batch_square_sum = + ctx.Output(framework::GradVarName("BatchSquareSum")); + + EigenVectorArrayMap d_batch_size_arr( + d_batch_size->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap d_batch_sum_arr( + d_batch_sum->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap d_batch_square_sum_arr( + d_batch_square_sum->mutable_data(ctx.GetPlace()), C); + + d_batch_size_arr.setZero(); + d_batch_sum_arr.setZero(); + d_batch_square_sum_arr.setZero(); + + const float epsilon = ctx.Attr("epsilon"); + switch ( + data_layout) { // because it's two dimensions, so make no difference + case DataLayout::kNCHW: + case DataLayout::kNHWC: { + ConstEigenVectorArrayMap scales_arr(scales->data(), C); + ConstEigenVectorArrayMap means_arr(means->data(), C); + ConstEigenArrayMap x_arr(x->data(), C, N); + ConstEigenArrayMap d_y_arr(d_y->data(), C, N); + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, N); + d_x_arr.setZero(); + for (int nc = 0; nc < N; ++nc) { + d_x_arr.col(nc) = d_y_arr.col(nc) * scales_arr; + } + + // calculate data sum and squre sum + ConstEigenVectorArrayMap batch_size_arr(batch_size->data(), C); + ConstEigenVectorArrayMap batch_sum_arr(batch_sum->data(), C); + ConstEigenVectorArrayMap batch_square_sum_arr( + batch_square_sum->data(), C); + Eigen::Array sample_sum(C); + Eigen::Array sample_square_sum(C); + // calculate data sample sum and square sum + sample_sum.setZero(); + sample_square_sum.setZero(); + for (int nc = 0; nc < N; ++nc) { + sample_sum += x_arr.col(nc); + sample_square_sum += (x_arr.col(nc) - means_arr).square(); + } + // calculate gradient + d_batch_size_arr.setConstant(N); + d_batch_sum_arr = sample_sum; + d_batch_square_sum_arr = sample_square_sum + d_batch_size_arr * epsilon; + break; + } + default: + PADDLE_THROW("Unknown storage order: %s", data_layout_str); + } + } +}; + +class DataNormGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("data_norm_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + + op->SetInput("BatchSize", Input("BatchSize")); + op->SetInput("BatchSum", Input("BatchSum")); + op->SetInput("BatchSquareSum", Input("BatchSquareSum")); + op->SetInput("Scales", Output("Scales")); + op->SetInput("Means", Output("Means")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("BatchSize"), InputGrad("BatchSize")); + op->SetOutput(framework::GradVarName("BatchSum"), InputGrad("BatchSum")); + op->SetOutput(framework::GradVarName("BatchSquareSum"), + InputGrad("BatchSquareSum")); + + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(data_norm, ops::DataNormOp, ops::DataNormOpMaker, + ops::DataNormGradMaker); +REGISTER_OPERATOR(data_norm_grad, ops::DataNormGradOp); + +REGISTER_OP_CPU_KERNEL( + data_norm, ops::DataNormKernel, + ops::DataNormKernel); +REGISTER_OP_CPU_KERNEL( + data_norm_grad, + ops::DataNormGradKernel, + ops::DataNormGradKernel); diff --git 
a/paddle/fluid/operators/data_norm_op.h b/paddle/fluid/operators/data_norm_op.h new file mode 100644 index 0000000000..63451214bc --- /dev/null +++ b/paddle/fluid/operators/data_norm_op.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class DataNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +template +class DataNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 341f36763f..269d0086ef 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -55,6 +55,7 @@ __all__ = [ 'adaptive_pool2d', 'adaptive_pool3d', 'batch_norm', + 'data_norm', 'beam_search_decode', 'conv2d_transpose', 'conv3d_transpose', @@ -2886,6 +2887,133 @@ def batch_norm(input, return helper.append_activation(batch_norm_out) +def data_norm(input, + act=None, + epsilon=1e-05, + param_attr=None, + data_layout='NCHW', + in_place=False, + use_mkldnn=False, + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=False): + """ + **Data Normalization Layer** + + Can be used as a normalizer function for conv2d and fully_connected operations. + The required data format for this layer is one of the following: + + 1. NHWC `[batch, in_height, in_width, in_channels]` + + 2. NCHW `[batch, in_channels, in_height, in_width]` + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Args: + input(variable): The input variable which is a LoDTensor. + act(string, Default None): Activation type, linear|relu|prelu|... + epsilon(float, Default 1e-05): + param_attr(ParamAttr): The parameter attribute for Parameter `scale`. + data_layout(string, default NCHW): NCHW|NHWC + in_place(bool, Default False): Make the input and output of batch norm reuse memory. + use_mkldnn(bool, Default false): ${use_mkldnn_comment} + name(string, Default None): A name for this layer(optional). If set None, the layer + will be named automatically. + moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. 
+ moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance. + do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not. + + Returns: + Variable: A tensor variable which is the result after applying data normalization on the input. + + Examples: + + .. code-block:: python + + data = fluid.layers.data(input=x, size=200, param_attr='fc1.w') + hidden2 = fluid.layers.data_norm(input=hidden1) + """ + helper = LayerHelper('data_norm', **locals()) + dtype = helper.input_dtype() + + input_shape = input.shape + if data_layout == 'NCHW': + channel_num = input_shape[1] + else: + if data_layout == 'NHWC': + channel_num = input_shape[-1] + else: + raise ValueError("unsupported data layout:" + data_layout) + + param_shape = [channel_num] + + batch_size_default = 1e4 + batch_sum_default = 0.0 + batch_square_sum_default = 1e4 + + if param_attr and isinstance(param_attr, dict): + batch_size_default = param_attr.get("batch_size", 1e4) + batch_sum_default = param_attr.get("batch_sum", 0.0) + batch_square_sum_default = param_attr.get("batch_square", 1e4) + + # create parameter + batch_size = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_size', + initializer=Constant(value=float(batch_size_default)), + trainable=True), + shape=param_shape, + dtype=input.dtype) + + batch_sum = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_sum', + initializer=Constant(value=float(batch_sum_default)), + trainable=True), + shape=param_shape, + dtype=input.dtype) + + batch_square_sum = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_square_sum', + initializer=Constant(value=float(batch_square_sum_default)), + trainable=True), + shape=param_shape, + dtype=input.dtype) + + means = helper.create_variable(dtype=dtype, stop_gradient=True) + scales = helper.create_variable(dtype=dtype, stop_gradient=True) + + data_norm_out = input if in_place else helper.create_variable(dtype=dtype) + + helper.append_op( + type="data_norm", + inputs={ + "X": input, + "BatchSize": batch_size, + "BatchSum": batch_sum, + "BatchSquareSum": batch_square_sum + }, + outputs={"Y": data_norm_out, + "Means": means, + "Scales": scales}, + attrs={"epsilon": epsilon, + "use_mkldnn": use_mkldnn}) + + return helper.append_activation(data_norm_out) + + @templatedoc() def layer_norm(input, scale=True, @@ -3054,9 +3182,9 @@ def group_norm(input, inputs['Bias'] = bias # create output - mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) - variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) - group_norm_out = helper.create_tmp_variable(dtype) + mean_out = helper.create_variable(dtype=dtype, stop_gradient=True) + variance_out = helper.create_variable(dtype=dtype, stop_gradient=True) + group_norm_out = helper.create_variable(dtype) helper.append_op( type="group_norm", From f18e8a7a5e9d9964f4fe1c60f53d32368b126d28 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 20 Dec 2018 10:38:09 +0800 Subject: [PATCH 06/22] remove some comments & refine doc & put template class in .h test=develop --- .../teacher_student_sigmoid_loss_op.cc | 110 ++---------------- .../teacher_student_sigmoid_loss_op.h | 93 +++++++++++++++ .../test_teacher_student_sigmoid_loss_op.py | 11 -- 3 files changed, 101 insertions(+), 113 deletions(-) diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index 4b307140c5..c8ee13875c 100644 --- 
a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -115,18 +115,22 @@ class TeacherStudentSigmoidLossOpMaker AddOutput("Y", "(Tensor, default Tensor), a 2-D tensor with shape " "[N x 1]. The teacher student sigmoid loss."); - AddAttr("soft_max_up_bound", "fp32, default 15.0").SetDefault(15.0); - AddAttr("soft_max_lower_bound", "fp32, default -15.0") + AddAttr( + "soft_max_up_bound", + "fp32, if input > soft_max_up_bound, will be bound, default 15.0") + .SetDefault(15.0); + AddAttr( + "soft_max_lower_bound", + "fp32, if input < soft_max_lower_bound, will be bound, default -15.0") .SetDefault(-15.0); AddComment(R"DOC( TeacherStudentSigmoidLoss Operator. -TeacherStudentSigmoidLoss Operator. It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is that we add another label(z') to original. loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) z is click or not - z' is value q of feed_fine + z' is teacher value label = {-2, -1, [0, 2]} when z' is not exist, clk = 0 : label = -2; when z' is not exist, clk = 1 : label = -1; @@ -137,104 +141,6 @@ we add another label(z') to original. } }; -// template -template -class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()), - "This kernel only runs on CPU."); - - Tensor* y = context.Output("Y"); - const Tensor* x = context.Input("X"); - const Tensor* labels = context.Input("Label"); - T* y_data = y->mutable_data(context.GetPlace()); - const T* x_data = x->data(); - const T* label_data = labels->data(); - int64_t batch_size = x->dims()[0]; - // loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + - // log(1 + exp(-abs(x))) - // z is click or not - // z' is value q of feed_fine - // label = {-2, -1, [0, 2]} - // when z' is not exist, clk = 0 : label = -2; - // when z' is not exist, clk = 1 : label = -1; - // when z' is exist , clk = 0 : label = 0 + z'; - // when z' is exist , clk = 1 : label = 1 + z'; - for (int i = 0; i < batch_size; ++i) { - if (label_data[i] < -1.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + - log(1.0 + exp(-fabs(x_data[i]))); - } else if (label_data[i] < 0.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + - log(1.0 + exp(-fabs(x_data[i]))); - } else if (label_data[i] < 1.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + - log(1.0 + exp(-fabs(x_data[i]))) + - (x_data[i] > 0 ? x_data[i] : 0.0) - - x_data[i] * label_data[i] + - log(1.0 + exp(-fabs(x_data[i]))); - } else { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + - log(1.0 + exp(-fabs(x_data[i]))) + - (x_data[i] > 0 ? 
x_data[i] : 0.0) - - x_data[i] * (label_data[i] - 1.0) + - log(1.0 + exp(-fabs(x_data[i]))); - } - } - } -}; - -template -class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - const T* x_data = x->data(); - - Tensor* dx = context.Output(framework::GradVarName("X")); - T* dx_data = dx->mutable_data(context.GetPlace()); - - const Tensor* labels = context.Input("Label"); - const T* label_data = labels->data(); - - T soft_max_up_bound = - static_cast(context.Attr("soft_max_up_bound")); - T soft_max_lower_bound = - static_cast(context.Attr("soft_max_lower_bound")); - - int64_t batch_size = x->dims()[0]; - - const framework::Tensor* dOut = - context.Input(framework::GradVarName("Y")); - - const T* dout_data = dOut->data(); - - for (int i = 0; i < batch_size; ++i) { - T sum_val = x_data[i]; - if (sum_val > soft_max_up_bound) { - sum_val = soft_max_up_bound; - } else { - if (sum_val < soft_max_lower_bound) { - sum_val = soft_max_lower_bound; - } - } - - T pred = 1.0 / (1.0 + exp(-sum_val)); - if (label_data[i] < -1.0) { - dx_data[i] = 0.0 - pred; - } else if (label_data[i] < 0.0) { - dx_data[i] = 1.0 - pred; - } else { - dx_data[i] = label_data[i] - 2.0 * pred; - } - if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) { - dx_data[i] = 0; - } - dx_data[i] *= dout_data[i] * -1; - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h index f8e64c4d18..41d2662ae2 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h @@ -20,6 +20,99 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + Tensor* y = context.Output("Y"); + const Tensor* x = context.Input("X"); + const Tensor* labels = context.Input("Label"); + T* y_data = y->mutable_data(context.GetPlace()); + const T* x_data = x->data(); + const T* label_data = labels->data(); + int64_t batch_size = x->dims()[0]; + // loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + + // log(1 + exp(-abs(x))) + // z is click or not + // z' is value q of feed_fine + // label = {-2, -1, [0, 2]} + // when z' is not exist, clk = 0 : label = -2; + // when z' is not exist, clk = 1 : label = -1; + // when z' is exist , clk = 0 : label = 0 + z'; + // when z' is exist , clk = 1 : label = 1 + z'; + for (int i = 0; i < batch_size; ++i) { + if (label_data[i] < -1.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + + log(1.0 + exp(-fabs(x_data[i]))); + } else if (label_data[i] < 0.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + + log(1.0 + exp(-fabs(x_data[i]))); + } else if (label_data[i] < 1.0) { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + + log(1.0 + exp(-fabs(x_data[i]))) + + (x_data[i] > 0 ? x_data[i] : 0.0) - + x_data[i] * label_data[i] + + log(1.0 + exp(-fabs(x_data[i]))); + } else { + y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + + log(1.0 + exp(-fabs(x_data[i]))) + + (x_data[i] > 0 ? 
x_data[i] : 0.0) - + x_data[i] * (label_data[i] - 1.0) + + log(1.0 + exp(-fabs(x_data[i]))); + } + } + } +}; +template +class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + const T* x_data = x->data(); + + Tensor* dx = context.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(context.GetPlace()); + + const Tensor* labels = context.Input("Label"); + const T* label_data = labels->data(); + + T soft_max_up_bound = + static_cast(context.Attr("soft_max_up_bound")); + T soft_max_lower_bound = + static_cast(context.Attr("soft_max_lower_bound")); + + int64_t batch_size = x->dims()[0]; + + const framework::Tensor* dOut = + context.Input(framework::GradVarName("Y")); + + const T* dout_data = dOut->data(); + + for (int i = 0; i < batch_size; ++i) { + T sum_val = x_data[i]; + if (sum_val > soft_max_up_bound) { + sum_val = soft_max_up_bound; + } else { + if (sum_val < soft_max_lower_bound) { + sum_val = soft_max_lower_bound; + } + } + + T pred = 1.0 / (1.0 + exp(-sum_val)); + if (label_data[i] < -1.0) { + dx_data[i] = 0.0 - pred; + } else if (label_data[i] < 0.0) { + dx_data[i] = 1.0 - pred; + } else { + dx_data[i] = label_data[i] - 2.0 * pred; + } + if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) { + dx_data[i] = 0; + } + dx_data[i] *= dout_data[i] * -1; + } + } +}; } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py index faa5163b32..26bf0fd883 100644 --- a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py @@ -27,9 +27,6 @@ class TestTeacherStudentSigmoidLossOp(OpTest): """ def setUp(self): - """ - ut - """ self.op_type = "teacher_student_sigmoid_loss" batch_size = 16 num_classes = 1 @@ -50,21 +47,13 @@ class TestTeacherStudentSigmoidLossOp(OpTest): elif label < 1.0: outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x))) + \ max(x, 0.0) - x * label + log(1.0 + exp(-abs(x)))) - #print "33 python x:", x, "python label:", label, "term1:", max(x, 0.0) + log(1.0 + exp(-abs(x))), "term2:", max(x, 0.0) - x * label + log(1.0 + exp(-abs(x))) else: outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x))) + \ max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x)))) - #print "44 python x:", x, "python label:", label, "term1:", max(x, 0.0) - x + log(1.0 + exp(-abs(x))), "term2:", max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x))) self.outputs = {'Y': np.array(outs)} def test_check_output(self): - """ - ut - """ self.check_output() def test_check_grad(self): - """ - ut - """ self.check_grad(["X"], "Y", numeric_grad_delta=0.005) From 9ae50dd07d4c912e9cb78dfc5703cba808684343 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 3 Jan 2019 16:22:05 +0800 Subject: [PATCH 07/22] fix gpu buils issue on windows test=develop --- cmake/generic.cmake | 3 +++ paddle/fluid/platform/cuda_helper_test.cu | 1 + 2 files changed, 4 insertions(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c6fe2e970d..9419ffbad0 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -447,6 +447,9 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) + if(WIN32) + list(APPEND nv_test_DEPS shlwapi) + endif(WIN32) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) common_link(${TARGET_NAME}) diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index 466bf90c63..5b605fdc96 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -15,6 +15,7 @@ #include #include #include +#include #include #define PADDLE_CUDA_FP16 From 25523bb8e67496e8f761f6da1351c034e1980759 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 3 Jan 2019 16:46:56 +0800 Subject: [PATCH 08/22] test=develop --- cmake/generic.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9419ffbad0..49e8dcb70a 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -447,10 +447,10 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) if(WIN32) - list(APPEND nv_test_DEPS shlwapi) + target_link_libraries(${TARGET_NAME} shlwapi) endif(WIN32) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) From fd4f4d0e5f5eb7eccfaf2a1234e66d612464a94b Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 3 Jan 2019 19:52:24 +0800 Subject: [PATCH 09/22] fix build issue test=develop --- paddle/fluid/platform/cuda_helper_test.cu | 2 +- paddle/fluid/platform/float16_test.cu | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index 5b605fdc96..ff49b92ff5 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #define PADDLE_CUDA_FP16 diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index b1b51d804e..14cad927f0 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -271,11 +271,13 @@ TEST(float16, isinf) { float16 b = float16(INFINITY); // underflow to 0 float16 native_a(5e-40f); - // overflow to inf - float16 native_b(5e40f); EXPECT_EQ(std::isinf(a), true); EXPECT_EQ(std::isinf(b), true); +#ifndef _WIN32 + // overflow to inf + float16 native_b(5e40f); EXPECT_EQ(std::isinf(native_b), true); +#endif EXPECT_EQ(native_a, float16(0)); } From 0d0bc61248147d1f382a76e19520750ce4e63eb0 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 7 Jan 2019 18:42:42 +0800 Subject: [PATCH 10/22] update api test=develop --- paddle/fluid/imperative/layer.h | 5 +++- paddle/fluid/pybind/imperative.h | 8 ++++--- paddle/fluid/pybind/pybind.cc | 7 ++++-- python/paddle/fluid/imperative/layers.py | 23 ++++++++++++++++--- python/paddle/fluid/imperative/nn.py | 6 ++--- .../fluid/tests/unittests/test_imperative.py | 6 ++--- .../unittests/test_imperative_optimizer.py | 4 ++-- 7 
files changed, 42 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 2abda933cf..377ac3e1c5 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -153,7 +153,10 @@ class Layer { return vars; } - virtual void Backward() { LOG(ERROR) << "To support customize"; } + virtual std::vector Backward(const std::vector& inputs) { + std::vector vars; + return vars; + } }; } // namespace imperative diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index 7a9d3a01ea..ef0d643954 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace pybind { -class PyLayer : public imperative::Layer { +class Layer : public imperative::Layer { public: using imperative::Layer::Layer; // Inherit constructors @@ -32,8 +32,10 @@ class PyLayer : public imperative::Layer { inputs); // NOLINT } - void Backward() override { - PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT + std::vector Backward( + const std::vector& inputs) override { + PYBIND11_OVERLOAD(std::vector, Layer, Backward, + inputs); // NOLINT } }; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3b81d59ad9..6e3c52da89 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -170,14 +170,17 @@ PYBIND11_MODULE(core, m) { }, py::return_value_policy::reference); - py::class_ layer(m, "Layer"); + py::class_ layer(m, "Layer"); layer.def(py::init<>()) .def("forward", [](imperative::Layer &self, const std::vector &inputs) { return self.Forward(inputs); }) - .def("backward", &imperative::Layer::Backward); + .def("backward", [](imperative::Layer &self, + const std::vector &inputs) { + return self.Backward(inputs); + }); BindTracer(&m); py::class_(m, "Tensor", py::buffer_protocol()) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index d78d61eb3f..1ebf79e052 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -20,10 +20,12 @@ from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.imperative import base -__all__ = ['PyLayer'] +__all__ = ['Layer'] -class PyLayer(core.Layer): +class Layer(core.Layer): + """Layers composed of operators.""" + def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None): self._once_built = False self._dtype = dtype @@ -37,8 +39,23 @@ class PyLayer(core.Layer): self._once_built = True outputs = self.forward(*inputs) - return outputs def forward(self, *inputs): raise NotImplementedError + + def backward(self, *inputs): + raise ValueError("Layer shouldn't implement backward") + + +class PyLayer(core.Layer): + """Layers composed of user-defined python codes.""" + + def __call__(self, *inputs): + pass + + def forward(self, *inputs): + raise NotImplementedError + + def backward(self, *inputs): + raise NotImplementedError diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 4f30417e99..8754e5d4d0 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -30,7 +30,7 @@ __all__ = [ ] -class Conv2D(layers.PyLayer): +class Conv2D(layers.Layer): def __init__(self, num_channels, num_filters, @@ -143,7 +143,7 @@ class Conv2D(layers.PyLayer): return self._helper.append_activation(pre_act) -class Pool2D(layers.PyLayer): +class Pool2D(layers.Layer): def 
__init__(self, pool_size=-1, pool_type="max", @@ -205,7 +205,7 @@ class Pool2D(layers.PyLayer): return pool_out -class FC(layers.PyLayer): +class FC(layers.Layer): def __init__(self, size, param_attr=None, diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 1dc13ec74e..44005411d1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -22,7 +22,7 @@ from paddle.fluid.imperative.nn import FC from test_imperative_base import new_program_scope -class MyLayer(fluid.imperative.PyLayer): +class MyLayer(fluid.imperative.Layer): def __init__(self): super(MyLayer, self).__init__() @@ -34,7 +34,7 @@ class MyLayer(fluid.imperative.PyLayer): return [x] -class MLP(fluid.imperative.PyLayer): +class MLP(fluid.imperative.Layer): def __init__(self): super(MLP, self).__init__() self._fc1 = FC(3, @@ -56,7 +56,7 @@ class TestImperative(unittest.TestCase): with fluid.imperative.guard(): cl = core.Layer() cl.forward([]) - l = fluid.imperative.PyLayer() + l = fluid.imperative.Layer() self.assertRaises(NotImplementedError, l.forward, []) def test_layer_in_out(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 5d97edf876..0549f50fe2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -26,7 +26,7 @@ from paddle.fluid.imperative.base import to_variable from test_imperative_base import new_program_scope -class SimpleImgConvPool(fluid.imperative.PyLayer): +class SimpleImgConvPool(fluid.imperative.Layer): def __init__(self, num_channels, num_filters, @@ -72,7 +72,7 @@ class SimpleImgConvPool(fluid.imperative.PyLayer): return x -class MNIST(fluid.imperative.PyLayer): +class MNIST(fluid.imperative.Layer): def __init__(self, param_attr=None, bias_attr=None): super(MNIST, self).__init__() From b629133375728fcb0be71c2d6fcf7c9d83a2b391 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 8 Jan 2019 10:52:03 +0800 Subject: [PATCH 11/22] checkpoint runnable PyLayer test=develop --- paddle/fluid/imperative/layer.h | 49 +++++++++++++++++-- paddle/fluid/pybind/imperative.h | 6 --- paddle/fluid/pybind/pybind.cc | 21 +++++--- python/paddle/fluid/imperative/layers.py | 22 ++++++--- .../fluid/tests/unittests/test_imperative.py | 26 ++++++++++ 5 files changed, 101 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 377ac3e1c5..d76512af04 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -17,6 +17,9 @@ #include #include #include +#include "pybind11/pybind11.h" + +#include "Python.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" @@ -25,6 +28,8 @@ namespace paddle { namespace imperative { +namespace py = ::pybind11; + class PreparedOp { public: PreparedOp(const framework::OperatorBase& op, @@ -152,10 +157,48 @@ class Layer { std::vector vars; return vars; } +}; - virtual std::vector Backward(const std::vector& inputs) { - std::vector vars; - return vars; +static void CallPythonFunc(py::object* callable, + const std::vector& ins, + std::vector* outs) { + py::gil_scoped_acquire guard; + py::tuple in_args(ins.size()); + for (size_t i = 0; i < ins.size(); ++i) { + in_args[i] = ins[i].IsInitialized() ? 
py::cast(ins[i]) : py::cast(nullptr); + } + + auto ret = (*callable)(in_args); + auto ret_tuple = py::cast(ret); + size_t ret_num = py::len(ret_tuple); + for (size_t i = 0; i < ret_num; ++i) { + try { + auto* py_out_tensor = py::cast(ret_tuple[i]); + PADDLE_ENFORCE_NOT_NULL(py_out_tensor, + "Output tensor %d should not be nullptr", i); + outs->push_back(py_out_tensor); + } catch (py::cast_error&) { + PADDLE_THROW("The %d-th output must be LoDTensor", i); + } + } +} + +class PyLayer { + public: + virtual ~PyLayer() {} + + static std::vector Apply(py::object* callable, + const std::vector& inputs) { + std::vector outputs; + std::vector tensor_inputs; + std::vector tensor_outputs; + + for (const VarBase& in : inputs) { + tensor_inputs.push_back(in.var_->Get()); + } + + CallPythonFunc(callable, tensor_inputs, &tensor_outputs); + return outputs; } }; diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index ef0d643954..f947b743f9 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -31,12 +31,6 @@ class Layer : public imperative::Layer { PYBIND11_OVERLOAD(std::vector, Layer, Forward, inputs); // NOLINT } - - std::vector Backward( - const std::vector& inputs) override { - PYBIND11_OVERLOAD(std::vector, Layer, Backward, - inputs); // NOLINT - } }; class PyOpBase : public imperative::OpBase { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6e3c52da89..d065818bc8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -172,15 +172,20 @@ PYBIND11_MODULE(core, m) { py::class_ layer(m, "Layer"); layer.def(py::init<>()) - .def("forward", - [](imperative::Layer &self, - const std::vector &inputs) { - return self.Forward(inputs); - }) - .def("backward", [](imperative::Layer &self, - const std::vector &inputs) { - return self.Backward(inputs); + .def("forward", [](imperative::Layer &self, + const std::vector &inputs) { + return self.Forward(inputs); }); + + py::class_(m, "PyLayer") + .def(py::init<>()) + .def_static("apply", + [](py::object *callable, + const std::vector &inputs) + -> std::vector { + return imperative::PyLayer::Apply(callable, inputs); + }); + BindTracer(&m); py::class_(m, "Tensor", py::buffer_protocol()) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 1ebf79e052..7696427366 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -20,7 +20,7 @@ from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.imperative import base -__all__ = ['Layer'] +__all__ = ['Layer', 'PyLayer'] class Layer(core.Layer): @@ -48,14 +48,24 @@ class Layer(core.Layer): raise ValueError("Layer shouldn't implement backward") -class PyLayer(core.Layer): +# TODO(panyx0718): Inherit from C++ base class. 
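+# A PyLayer wraps user-supplied, numpy-level forward/backward functions instead of traced operators.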
+class PyLayer(core.PyLayer): """Layers composed of user-defined python codes.""" - def __call__(self, *inputs): - pass + def __init__(self): + super(PyLayer, self).__init__() - def forward(self, *inputs): + @staticmethod + def forward(inputs): raise NotImplementedError - def backward(self, *inputs): + @staticmethod + def backward(inputs): raise NotImplementedError + + @classmethod + def __call__(cls, inputs): + inputs = map(base.to_variable, inputs) + inputs = [x._ivar for x in inputs] + sys.stderr.write('%s\n' % inputs) + return core.PyLayer.apply(cls.forward, inputs) diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 44005411d1..ae99fb82e3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -15,6 +15,7 @@ import contextlib import unittest import numpy as np +import sys import paddle.fluid as fluid from paddle.fluid import core @@ -34,6 +35,24 @@ class MyLayer(fluid.imperative.Layer): return [x] +class MyPyLayer(fluid.imperative.PyLayer): + def __init__(self): + super(MyPyLayer, self).__init__() + + @staticmethod + def forward(inputs): + sys.stderr.write('before forward\n') + ret = np.tanh(inputs[0]) + sys.stderr.write('after forward: %s\n' % ret) + tensor = core.LoDTensor() + tensor.set(ret, core.CPUPlace()) + return tuple([tensor]) + + @staticmethod + def backward(douts, outs): + return np.array(douts[0]) * (1 - np.square(np.array(outs[0]))) + + class MLP(fluid.imperative.Layer): def __init__(self): super(MLP, self).__init__() @@ -59,6 +78,13 @@ class TestImperative(unittest.TestCase): l = fluid.imperative.Layer() self.assertRaises(NotImplementedError, l.forward, []) + def test_pylayer(self): + with fluid.imperative.guard(): + my_py_layer = MyPyLayer() + out = my_py_layer([np.ones([2, 2], np.float32)]) + sys.stderr.write('%s\n' % np.array(out)) + # out.backward() + def test_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): From d54133ea85b586b7ddf337e9af08145c7eba5949 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 8 Jan 2019 11:35:39 +0800 Subject: [PATCH 12/22] not include the numeric under linux test=develop --- paddle/fluid/platform/cuda_helper_test.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index ff49b92ff5..9e3025bf30 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -15,7 +15,9 @@ #include #include #include +#ifdef _WIN32 #include +#endif #include #define PADDLE_CUDA_FP16 From 11d4d39cd7ce5dd33df94ed65854eb357cad2400 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 8 Jan 2019 13:16:45 +0800 Subject: [PATCH 13/22] forward working test=develop --- paddle/fluid/imperative/layer.h | 21 +++++++++++-------- paddle/fluid/pybind/pybind.cc | 5 +++-- python/paddle/fluid/framework.py | 5 ++++- python/paddle/fluid/imperative/layers.py | 16 ++++++++++++-- .../fluid/tests/unittests/test_imperative.py | 4 ++-- 5 files changed, 35 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index d76512af04..a0eee357c3 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -161,13 +161,14 @@ class Layer { static void CallPythonFunc(py::object* callable, const std::vector& ins, - std::vector* outs) { + std::vector* outs) { py::gil_scoped_acquire 
guard; py::tuple in_args(ins.size()); for (size_t i = 0; i < ins.size(); ++i) { in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr); } + // TODO(panyx0718): Who owns the returned LoDTensor. auto ret = (*callable)(in_args); auto ret_tuple = py::cast(ret); size_t ret_num = py::len(ret_tuple); @@ -176,7 +177,11 @@ static void CallPythonFunc(py::object* callable, auto* py_out_tensor = py::cast(ret_tuple[i]); PADDLE_ENFORCE_NOT_NULL(py_out_tensor, "Output tensor %d should not be nullptr", i); - outs->push_back(py_out_tensor); + VarBase* var = new VarBase(); + auto* tensor = var->var_->GetMutable(); + tensor->ShareDataWith(*py_out_tensor); + tensor->set_lod(py_out_tensor->lod()); + outs->push_back(var); } catch (py::cast_error&) { PADDLE_THROW("The %d-th output must be LoDTensor", i); } @@ -187,18 +192,16 @@ class PyLayer { public: virtual ~PyLayer() {} - static std::vector Apply(py::object* callable, - const std::vector& inputs) { - std::vector outputs; + static std::vector Apply(py::object* callable, + const std::vector& inputs) { std::vector tensor_inputs; - std::vector tensor_outputs; + std::vector ret; for (const VarBase& in : inputs) { tensor_inputs.push_back(in.var_->Get()); } - - CallPythonFunc(callable, tensor_inputs, &tensor_outputs); - return outputs; + CallPythonFunc(callable, tensor_inputs, &ret); + return ret; } }; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d065818bc8..684b931ee8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -182,9 +182,10 @@ PYBIND11_MODULE(core, m) { .def_static("apply", [](py::object *callable, const std::vector &inputs) - -> std::vector { + -> std::vector { return imperative::PyLayer::Apply(callable, inputs); - }); + }, + py::return_value_policy::take_ownership); BindTracer(&m); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 70767c962f..8e18dffac3 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -372,7 +372,10 @@ class Variable(object): self.stop_gradient = stop_gradient self.is_data = is_data if _in_imperative_mode(): - self._ivar = core.VarBase() + if 'ivar' in kwargs: + self._ivar = kwargs['ivar'] + else: + self._ivar = core.VarBase() self._ivar.desc = self.desc self._ivar.stop_gradient = stop_gradient diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 7696427366..06b6d7ac06 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -67,5 +67,17 @@ class PyLayer(core.PyLayer): def __call__(cls, inputs): inputs = map(base.to_variable, inputs) inputs = [x._ivar for x in inputs] - sys.stderr.write('%s\n' % inputs) - return core.PyLayer.apply(cls.forward, inputs) + ivars = core.PyLayer.apply(cls.forward, inputs) + ret = [] + for ivar in ivars: + tensor = ivar.value.get_tensor() + block = framework.default_main_program().current_block() + py_var = framework.Variable( + block, + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=tensor.shape(), + dtype=tensor._dtype(), + ivar=ivar) + ret.append(py_var) + return ret diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index ae99fb82e3..133e1e65c7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -81,8 +81,8 @@ class TestImperative(unittest.TestCase): def test_pylayer(self): with 
fluid.imperative.guard(): my_py_layer = MyPyLayer() - out = my_py_layer([np.ones([2, 2], np.float32)]) - sys.stderr.write('%s\n' % np.array(out)) + outs = my_py_layer([np.ones([2, 2], np.float32)]) + sys.stderr.write('%s\n' % outs[0]._numpy()) # out.backward() def test_layer_in_out(self): From 2349acea48f76cc527d9a47ff6a0b5da5e77aa2f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 8 Jan 2019 16:41:20 +0800 Subject: [PATCH 14/22] checkpoint test=develop --- paddle/fluid/imperative/layer.cc | 19 +++++++++++++++++ paddle/fluid/imperative/layer.h | 27 ++++++++++++------------ paddle/fluid/imperative/tracer.h | 15 +++++++++++++ paddle/fluid/pybind/imperative.cc | 4 +++- paddle/fluid/pybind/pybind.cc | 24 +++++++++++++++------ python/paddle/fluid/imperative/layers.py | 13 +++++++++--- 6 files changed, 78 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 9813149865..53e949d9f9 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -27,6 +27,8 @@ namespace paddle { namespace imperative { +std::map py_funcs_; + using framework::Variable; void AddTo(Variable* src, Variable* dst) { @@ -183,5 +185,22 @@ void VarBase::RunBackward() { Autograd().RunBackward(this); } +void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { + py_funcs_[func_id] = py_func; +} + +std::vector PyLayer::Apply(int func_id, + const std::vector& inputs) { + std::vector tensor_inputs; + std::vector ret; + + for (const VarBase& in : inputs) { + tensor_inputs.push_back(in.var_->Get()); + } + PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); + CallPythonFunc(py_funcs_[func_id], tensor_inputs, &ret); + return ret; +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index a0eee357c3..52cbb2c015 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -82,6 +82,7 @@ class PreparedOp { framework::OperatorWithKernel::OpKernelFunc func; platform::DeviceContext* dev_ctx; }; + class OpBase; class VarBase { @@ -128,7 +129,11 @@ class VarBase { class OpBase { public: - OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {} + OpBase() + : op_desc_(nullptr), + grad_op_desc_(nullptr), + forward_id_(-1), + backward_id_(-1) {} virtual ~OpBase() { if (grad_op_desc_) delete grad_op_desc_; @@ -139,6 +144,9 @@ class OpBase { framework::OpDesc* op_desc_; framework::OpDesc* grad_op_desc_; + int forward_id_; + int backward_id_; + std::map> input_vars_; std::map> output_vars_; std::map> pre_ops_; @@ -159,7 +167,7 @@ class Layer { } }; -static void CallPythonFunc(py::object* callable, +static void CallPythonFunc(const py::object& callable, const std::vector& ins, std::vector* outs) { py::gil_scoped_acquire guard; @@ -169,7 +177,7 @@ static void CallPythonFunc(py::object* callable, } // TODO(panyx0718): Who owns the returned LoDTensor. 
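   // For now each returned tensor is aliased into a fresh VarBase via ShareDataWith rather than deep-copied.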
- auto ret = (*callable)(in_args); + auto ret = callable(in_args); auto ret_tuple = py::cast(ret); size_t ret_num = py::len(ret_tuple); for (size_t i = 0; i < ret_num; ++i) { @@ -192,17 +200,10 @@ class PyLayer { public: virtual ~PyLayer() {} - static std::vector Apply(py::object* callable, - const std::vector& inputs) { - std::vector tensor_inputs; - std::vector ret; + static void RegisterFunc(int func_id, const py::object& py_func); - for (const VarBase& in : inputs) { - tensor_inputs.push_back(in.var_->Get()); - } - CallPythonFunc(callable, tensor_inputs, &ret); - return ret; - } + static std::vector Apply(int func_id, + const std::vector& inputs); }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index c6eff86fac..1954c7a68a 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -172,6 +172,21 @@ class Tracer { op->block_ = block; } + std::vector PyTrace(OpBase* op, + const std::vector& inputs) { + std::vector outputs = PyLayer::Apply(op->forward_id_, inputs); + /* + for (const VarBase& inp : inputs) { + if (inp.pre_op_) { + op->pre_ops_[it.first].push_back(inp->pre_op_); + op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_); + } else { + op->pre_ops_[it.first].push_back(nullptr); + } + }*/ + return outputs; + } + private: framework::BlockDesc* root_block_; }; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 5c1c7478f4..dbc7843caa 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -26,7 +26,9 @@ void BindTracer(pybind11::module *m) { [](imperative::Tracer &self, framework::BlockDesc *root_block) { new (&self) imperative::Tracer(root_block); }) - .def("trace", &imperative::Tracer::Trace); + .def("trace", &imperative::Tracer::Trace) + .def("py_trace", &imperative::Tracer::PyTrace, + pybind11::return_value_policy::take_ownership); } } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 684b931ee8..455bcc6a41 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -168,6 +168,13 @@ PYBIND11_MODULE(core, m) { self.op_desc_ = op_desc; } }, + py::return_value_policy::reference) + .def_property( + "forward_id", + [](const imperative::OpBase &self) { return self.forward_id_; }, + [](imperative::OpBase &self, int forward_id) { + self.forward_id_ = forward_id; + }, py::return_value_policy::reference); py::class_ layer(m, "Layer"); @@ -179,13 +186,16 @@ PYBIND11_MODULE(core, m) { py::class_(m, "PyLayer") .def(py::init<>()) - .def_static("apply", - [](py::object *callable, - const std::vector &inputs) - -> std::vector { - return imperative::PyLayer::Apply(callable, inputs); - }, - py::return_value_policy::take_ownership); + .def_static( + "apply", + [](int func_id, const std::vector &inputs) + -> std::vector { + return imperative::PyLayer::Apply(func_id, inputs); + }, + py::return_value_policy::take_ownership) + .def_static("register_func", [](int func_id, const py::object &callable) { + imperative::PyLayer::RegisterFunc(func_id, callable); + }); BindTracer(&m); diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 06b6d7ac06..40ec312b69 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -48,7 +48,6 @@ class Layer(core.Layer): raise ValueError("Layer shouldn't implement backward") -# TODO(panyx0718): Inherit from C++ base class. 
class PyLayer(core.PyLayer): """Layers composed of user-defined python codes.""" @@ -65,13 +64,21 @@ class PyLayer(core.PyLayer): @classmethod def __call__(cls, inputs): + tracer = framework._imperative_tracer() + block = framework.default_main_program().current_block() inputs = map(base.to_variable, inputs) inputs = [x._ivar for x in inputs] - ivars = core.PyLayer.apply(cls.forward, inputs) + + PyLayer.register_func(1, cls.forward) + + iop = core.OpBase() + iop.forward_id = 1 + block.ops.append(iop) + ivars = tracer.py_trace(iop, inputs) + # ivars = core.PyLayer.apply(cls.forward, inputs) ret = [] for ivar in ivars: tensor = ivar.value.get_tensor() - block = framework.default_main_program().current_block() py_var = framework.Variable( block, type=core.VarDesc.VarType.LOD_TENSOR, From 7aad6afd4994d4d077bca75d74807bf2d716a22b Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 9 Jan 2019 09:10:42 +0800 Subject: [PATCH 15/22] forward and backward test=develop --- paddle/fluid/imperative/layer.cc | 65 ++++++++++++------- paddle/fluid/imperative/layer.h | 35 +++++++++- paddle/fluid/imperative/tracer.h | 56 +++++++++++++--- paddle/fluid/pybind/pybind.cc | 9 ++- python/paddle/fluid/imperative/layers.py | 7 +- .../fluid/tests/unittests/test_imperative.py | 45 +++++++++++-- 6 files changed, 175 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 53e949d9f9..131e3e1bd5 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -57,6 +57,7 @@ class Autograd { if (var->stop_gradient_) { return; } + VLOG(3) << "start autograd"; std::deque ready; ready.push_back(var->pre_op_); @@ -122,11 +123,10 @@ framework::LoDTensor& VarBase::Grad() { } std::map> OpBase::ApplyGrad() { - if (!grad_op_desc_) { + if (!grad_op_desc_ && backward_id_ <= 0) { LOG(WARNING) << "op with no grad: " << op_desc_->Type(); return {}; } - VLOG(3) << "op grad " << grad_op_desc_->Type(); std::vector> tmp_vars; std::map> grad_outputs; @@ -142,23 +142,30 @@ std::map> OpBase::ApplyGrad() { } } - framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); - - // No need to do compile time infer shape here. - // grad_op_desc_->InferShape(*block_); - grad_op_desc_->InferVarType(block_); - - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc_); - framework::OperatorWithKernel* op_kernel = - dynamic_cast(opbase.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - - framework::Scope scope; - platform::CPUPlace place; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); - p.op.RuntimeInferShape(scope, place, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + if (backward_id_ > 0) { + VLOG(3) << "py_layer_grad"; + PyLayer::ApplyGrad(backward_id_, grad_input_vars_["X@GRAD"], + &(grad_outputs["Out@GRAD"])); + } else { + VLOG(3) << "op grad " << grad_op_desc_->Type(); + framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + + // No need to do compile time infer shape here. 
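+    // Shapes are resolved at run time by RuntimeInferShape right before the kernel launch, so only var types need propagating here.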
+ // grad_op_desc_->InferShape(*block_); + grad_op_desc_->InferVarType(block_); + + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc_); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + } for (auto it : grad_output_vars_) { auto& outputs = grad_outputs[it.first]; @@ -175,6 +182,7 @@ std::map> OpBase::ApplyGrad() { void VarBase::RunBackward() { if (!pre_op_) return; + VLOG(3) << "start backward"; auto grads_t = grads_->GetMutable(); float* data = grads_t->mutable_data(platform::CPUPlace()); std::fill(data, data + grads_t->numel(), 1.0); @@ -190,17 +198,30 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { } std::vector PyLayer::Apply(int func_id, - const std::vector& inputs) { + const std::vector& inputs) { std::vector tensor_inputs; std::vector ret; - for (const VarBase& in : inputs) { - tensor_inputs.push_back(in.var_->Get()); + for (const VarBase* in : inputs) { + tensor_inputs.push_back(in->var_->Get()); } PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); CallPythonFunc(py_funcs_[func_id], tensor_inputs, &ret); return ret; } +void PyLayer::ApplyGrad(int func_id, + const std::vector& inputs, + std::vector* outputs) { + std::vector tensor_inputs; + std::vector ret; + + for (const Variable* in : inputs) { + tensor_inputs.push_back(in->Get()); + } + PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); + CallPythonFunc(py_funcs_[func_id], tensor_inputs, outputs); +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 52cbb2c015..84e04cb74e 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -196,6 +196,35 @@ static void CallPythonFunc(const py::object& callable, } } +static void CallPythonFunc(const py::object& callable, + const std::vector& ins, + std::vector* outs) { + py::gil_scoped_acquire guard; + py::tuple in_args(ins.size()); + for (size_t i = 0; i < ins.size(); ++i) { + in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr); + } + VLOG(3) << "pyfunc in " << py::len(in_args); + + // TODO(panyx0718): Who owns the returned LoDTensor. 
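+  // This overload fills the caller-provided output Variables in place instead of allocating new VarBases.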
+ auto ret = callable(in_args); + auto ret_tuple = py::cast(ret); + size_t ret_num = py::len(ret_tuple); + VLOG(3) << "pyfunc out " << ret_num; + for (size_t i = 0; i < ret_num; ++i) { + try { + auto* py_out_tensor = py::cast(ret_tuple[i]); + PADDLE_ENFORCE_NOT_NULL(py_out_tensor, + "Output tensor %d should not be nullptr", i); + auto* tensor = (*outs)[i]->GetMutable(); + tensor->ShareDataWith(*py_out_tensor); + tensor->set_lod(py_out_tensor->lod()); + } catch (py::cast_error&) { + PADDLE_THROW("The %d-th output must be LoDTensor", i); + } + } +} + class PyLayer { public: virtual ~PyLayer() {} @@ -203,7 +232,11 @@ class PyLayer { static void RegisterFunc(int func_id, const py::object& py_func); static std::vector Apply(int func_id, - const std::vector& inputs); + const std::vector& inputs); + + static void ApplyGrad(int func_id, + const std::vector& inputs, + std::vector* outputs); }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 1954c7a68a..f6aebea9bb 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -131,6 +131,7 @@ class Tracer { if (!stop_gradient) { framework::OpDesc* grad_op_desc; + // TODO(panyx): Is this leaked? auto grad_to_var = new std::unordered_map(); CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); op->grad_op_desc_ = grad_op_desc; @@ -143,12 +144,14 @@ class Tracer { if (var_it == grad_to_var->end()) { auto fwd_var_it = vars.find(grad_invar); PADDLE_ENFORCE(fwd_var_it != vars.end()); + // Forward inputs or outputs. grad_in_vars.push_back(fwd_var_it->second->var_); } else { VarBase* var = vars[var_it->second]; if (!var->grads_->IsInitialized()) { InitVar(var->var_, var->grads_); } + // Douts. grad_in_vars.push_back(var->grads_); } } @@ -172,18 +175,51 @@ class Tracer { op->block_ = block; } - std::vector PyTrace(OpBase* op, - const std::vector& inputs) { - std::vector outputs = PyLayer::Apply(op->forward_id_, inputs); - /* - for (const VarBase& inp : inputs) { - if (inp.pre_op_) { - op->pre_ops_[it.first].push_back(inp->pre_op_); - op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_); + std::vector PyTrace(OpBase* op, const std::vector& inputs, + bool stop_gradient = false) { + VLOG(3) << "py_trace"; + op->input_vars_["X"] = inputs; + op->output_vars_["Out"] = PyLayer::Apply(op->forward_id_, inputs); + for (VarBase* inp : inputs) { + if (inp->pre_op_) { + op->pre_ops_["X"].push_back(inp->pre_op_); + op->pre_ops_out_idx_["X"].push_back(inp->pre_op_out_idx_); } else { - op->pre_ops_[it.first].push_back(nullptr); + op->pre_ops_["X"].push_back(nullptr); } - }*/ + } + + auto& outputs = op->output_vars_["Out"]; + for (size_t i = 0; i < outputs.size(); ++i) { + VarBase* out = outputs[i]; + out->stop_gradient_ = stop_gradient; + out->pre_op_ = op; + out->pre_op_out_name_ = "Out"; + out->pre_op_out_idx_ = i; + } + if (!stop_gradient) { + auto& grad_input_vars = op->grad_input_vars_["X@GRAD"]; + auto& grad_output_vars = op->grad_output_vars_["Out@GRAD"]; + + for (const VarBase* inp : inputs) { + grad_input_vars.push_back(inp->var_); + } + for (VarBase* out : outputs) { + grad_input_vars.push_back(out->var_); + } + for (VarBase* out : outputs) { + grad_input_vars.push_back(out->grads_); + if (!grad_input_vars.back()->IsInitialized()) { + InitVar(out->var_, grad_input_vars.back()); + } + } + for (const VarBase* inp : inputs) { + grad_output_vars.push_back(inp->grads_); + if (!grad_output_vars.back()->IsInitialized()) { + InitVar(inp->var_, 
grad_output_vars.back()); + } + } + } return outputs; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 455bcc6a41..93dd16c8c9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -175,6 +175,13 @@ PYBIND11_MODULE(core, m) { [](imperative::OpBase &self, int forward_id) { self.forward_id_ = forward_id; }, + py::return_value_policy::reference) + .def_property( + "backward_id", + [](const imperative::OpBase &self) { return self.backward_id_; }, + [](imperative::OpBase &self, int backward_id) { + self.backward_id_ = backward_id; + }, py::return_value_policy::reference); py::class_ layer(m, "Layer"); @@ -188,7 +195,7 @@ PYBIND11_MODULE(core, m) { .def(py::init<>()) .def_static( "apply", - [](int func_id, const std::vector &inputs) + [](int func_id, const std::vector &inputs) -> std::vector { return imperative::PyLayer::Apply(func_id, inputs); }, diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 40ec312b69..2b224b8dbb 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -59,22 +59,23 @@ class PyLayer(core.PyLayer): raise NotImplementedError @staticmethod - def backward(inputs): + def backward(douts): raise NotImplementedError @classmethod def __call__(cls, inputs): tracer = framework._imperative_tracer() block = framework.default_main_program().current_block() - inputs = map(base.to_variable, inputs) inputs = [x._ivar for x in inputs] PyLayer.register_func(1, cls.forward) + PyLayer.register_func(2, cls.backward) iop = core.OpBase() iop.forward_id = 1 + iop.backward_id = 2 block.ops.append(iop) - ivars = tracer.py_trace(iop, inputs) + ivars = tracer.py_trace(iop, inputs, False) # ivars = core.PyLayer.apply(cls.forward, inputs) ret = [] for ivar in ivars: diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 133e1e65c7..9f93ba9338 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -49,8 +49,18 @@ class MyPyLayer(fluid.imperative.PyLayer): return tuple([tensor]) @staticmethod - def backward(douts, outs): - return np.array(douts[0]) * (1 - np.square(np.array(outs[0]))) + def backward(inputs): + sys.stderr.write('calling into backward: %s\n' % str(inputs)) + inp, out, dout = inputs + inp = np.array(inp) + out = np.array(out) + dout = np.array(dout) + sys.stderr.write('calling into backward: %s, %s, %s\n' % + (inp, out, dout)) + ret = np.array(dout) * (1 - np.square(np.array(out))) + tensor = core.LoDTensor() + tensor.set(ret, core.CPUPlace()) + return tuple([tensor]) class MLP(fluid.imperative.Layer): @@ -71,20 +81,44 @@ class MLP(fluid.imperative.Layer): class TestImperative(unittest.TestCase): + """ def test_layer(self): with fluid.imperative.guard(): cl = core.Layer() cl.forward([]) l = fluid.imperative.Layer() self.assertRaises(NotImplementedError, l.forward, []) + """ def test_pylayer(self): + np_inp = np.ones([2, 2], np.float32) with fluid.imperative.guard(): my_py_layer = MyPyLayer() - outs = my_py_layer([np.ones([2, 2], np.float32)]) - sys.stderr.write('%s\n' % outs[0]._numpy()) - # out.backward() + var_inp = fluid.imperative.base.to_variable(np_inp) + outs = my_py_layer([var_inp]) + dy_out = np.sum(outs[0]._numpy()) + outs[0]._backward() + dy_grad = var_inp._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[2, 2], 
append_batch_size=False) + # TODO(panyx0718): Paddle doesn't diff against data `inp`. + x1 = inp * 1 + # TODO(panyx0718): If reduce_sum is skipped, the result is wrong. + x = fluid.layers.reduce_sum(fluid.layers.tanh(x1)) + param_grads = fluid.backward.append_backward( + x, parameter_list=[x1.name])[0] + exe = fluid.Executor(fluid.CPUPlace()) + + static_out, static_grad = exe.run( + feed={inp.name: np_inp}, + fetch_list=[x.name, param_grads[1].name]) + + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad, static_grad)) + """ def test_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): @@ -138,6 +172,7 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(dy_out, static_out)) self.assertTrue(np.allclose(dy_grad, static_grad)) + """ if __name__ == '__main__': From 9597fd05e9b1f669ad6102d67069f1c04e8840f8 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 9 Jan 2019 15:42:05 +0800 Subject: [PATCH 16/22] polish test=develop --- paddle/fluid/imperative/layer.cc | 90 ++++++++++++------- paddle/fluid/imperative/layer.h | 87 +++++------------- paddle/fluid/imperative/tracer.h | 5 +- paddle/fluid/pybind/pybind.cc | 10 ++- python/paddle/fluid/imperative/layers.py | 11 ++- .../fluid/tests/unittests/test_imperative.py | 44 ++++++++- 6 files changed, 136 insertions(+), 111 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 131e3e1bd5..d0aaa00c49 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -128,26 +128,23 @@ std::map> OpBase::ApplyGrad() { return {}; } - std::vector> tmp_vars; std::map> grad_outputs; - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - for (size_t i = 0; i < it.second.size(); ++i) { - // Allocate a new variable - Variable* tmp_var = new framework::Variable(); - tmp_var->GetMutable(); - - tmp_vars.emplace_back(tmp_var); - outputs.push_back(tmp_var); - } - } - if (backward_id_ > 0) { VLOG(3) << "py_layer_grad"; - PyLayer::ApplyGrad(backward_id_, grad_input_vars_["X@GRAD"], - &(grad_outputs["Out@GRAD"])); + grad_outputs["Out@GRAD"] = + PyLayer::ApplyGrad(backward_id_, grad_input_vars_["X@GRAD"]); } else { VLOG(3) << "op grad " << grad_op_desc_->Type(); + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + // Allocate a new variable + Variable* tmp_var = new framework::Variable(); + tmp_var->GetMutable(); + outputs.push_back(tmp_var); + } + } + framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); // No need to do compile time infer shape here. 
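(The next hunk tightens gradient-buffer ownership: scratch grad Variables are allocated per ApplyGrad call and deleted once AddTo has accumulated them into the original gradients.)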
@@ -170,10 +167,13 @@ std::map> OpBase::ApplyGrad() { for (auto it : grad_output_vars_) { auto& outputs = grad_outputs[it.first]; auto& origin_outputs = it.second; + PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* grad = outputs[i]; framework::Variable* orig_grad = origin_outputs[i]; - AddTo(outputs[i], orig_grad); + AddTo(grad, orig_grad); + delete grad; } } return input_vars_; @@ -197,30 +197,60 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { py_funcs_[func_id] = py_func; } +int PyLayer::NumFuncs() { return py_funcs_.size(); } + std::vector PyLayer::Apply(int func_id, const std::vector& inputs) { - std::vector tensor_inputs; - std::vector ret; - + std::vector invars; for (const VarBase* in : inputs) { - tensor_inputs.push_back(in->var_->Get()); + invars.push_back(in->var_); } PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); - CallPythonFunc(py_funcs_[func_id], tensor_inputs, &ret); + std::vector outvars = CallPythonFunc(py_funcs_[func_id], invars); + std::vector ret; + for (Variable* v : outvars) { + ret.push_back(new VarBase(v, new Variable())); + } return ret; } -void PyLayer::ApplyGrad(int func_id, - const std::vector& inputs, - std::vector* outputs) { - std::vector tensor_inputs; - std::vector ret; +std::vector PyLayer::ApplyGrad( + int func_id, const std::vector& inputs) { + PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); + return CallPythonFunc(py_funcs_[func_id], inputs); +} - for (const Variable* in : inputs) { - tensor_inputs.push_back(in->Get()); +std::vector PyLayer::CallPythonFunc( + const py::object& callable, const std::vector& ins) { + py::gil_scoped_acquire guard; + py::tuple in_args(ins.size()); + for (size_t i = 0; i < ins.size(); ++i) { + const framework::LoDTensor& t = ins[i]->Get(); + in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr); } - PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); - CallPythonFunc(py_funcs_[func_id], tensor_inputs, outputs); + VLOG(3) << "pyfunc in " << py::len(in_args); + + // TODO(panyx0718): Who owns the returned LoDTensor. 
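+  // Each output is wrapped below in a newly allocated framework::Variable owned by the caller, while the tensor buffer itself stays shared with the Python side.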
+ auto ret = callable(in_args); + auto ret_tuple = py::cast(ret); + size_t ret_num = py::len(ret_tuple); + std::vector outs; + VLOG(3) << "pyfunc out " << ret_num; + for (size_t i = 0; i < ret_num; ++i) { + try { + auto* py_out_tensor = py::cast(ret_tuple[i]); + PADDLE_ENFORCE_NOT_NULL(py_out_tensor, + "Output tensor %d should not be nullptr", i); + auto* var = new framework::Variable(); + auto* tensor = var->GetMutable(); + tensor->ShareDataWith(*py_out_tensor); + tensor->set_lod(py_out_tensor->lod()); + outs.push_back(var); + } catch (py::cast_error&) { + PADDLE_THROW("The %d-th output must be LoDTensor", i); + } + } + return outs; } } // namespace imperative diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 84e04cb74e..4be0614c7e 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -87,12 +87,15 @@ class OpBase; class VarBase { public: - VarBase() + VarBase() : VarBase(new framework::Variable(), new framework::Variable()) {} + + // Owns `var` and `grad` + VarBase(framework::Variable* var, framework::Variable* grad) : pre_op_(nullptr), pre_op_out_idx_(-1), var_desc_(nullptr), - var_(new framework::Variable()), - grads_(new framework::Variable()), + var_(var), + grads_(grad), stop_gradient_(false) {} explicit VarBase(bool stop_gradient) @@ -131,8 +134,8 @@ class OpBase { public: OpBase() : op_desc_(nullptr), - grad_op_desc_(nullptr), forward_id_(-1), + grad_op_desc_(nullptr), backward_id_(-1) {} virtual ~OpBase() { @@ -141,10 +144,13 @@ class OpBase { std::map> ApplyGrad(); + // One of `op_desc_` or `forward_id_` is set, not both. + // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. framework::OpDesc* op_desc_; - framework::OpDesc* grad_op_desc_; - int forward_id_; + // When has backward, one of `grad_op_desc_` or `backward_id_` is set, + // not both. + framework::OpDesc* grad_op_desc_; int backward_id_; std::map> input_vars_; @@ -167,76 +173,23 @@ class Layer { } }; -static void CallPythonFunc(const py::object& callable, - const std::vector& ins, - std::vector* outs) { - py::gil_scoped_acquire guard; - py::tuple in_args(ins.size()); - for (size_t i = 0; i < ins.size(); ++i) { - in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr); - } - - // TODO(panyx0718): Who owns the returned LoDTensor. - auto ret = callable(in_args); - auto ret_tuple = py::cast(ret); - size_t ret_num = py::len(ret_tuple); - for (size_t i = 0; i < ret_num; ++i) { - try { - auto* py_out_tensor = py::cast(ret_tuple[i]); - PADDLE_ENFORCE_NOT_NULL(py_out_tensor, - "Output tensor %d should not be nullptr", i); - VarBase* var = new VarBase(); - auto* tensor = var->var_->GetMutable(); - tensor->ShareDataWith(*py_out_tensor); - tensor->set_lod(py_out_tensor->lod()); - outs->push_back(var); - } catch (py::cast_error&) { - PADDLE_THROW("The %d-th output must be LoDTensor", i); - } - } -} - -static void CallPythonFunc(const py::object& callable, - const std::vector& ins, - std::vector* outs) { - py::gil_scoped_acquire guard; - py::tuple in_args(ins.size()); - for (size_t i = 0; i < ins.size(); ++i) { - in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr); - } - VLOG(3) << "pyfunc in " << py::len(in_args); - - // TODO(panyx0718): Who owns the returned LoDTensor. 
- auto ret = callable(in_args); - auto ret_tuple = py::cast(ret); - size_t ret_num = py::len(ret_tuple); - VLOG(3) << "pyfunc out " << ret_num; - for (size_t i = 0; i < ret_num; ++i) { - try { - auto* py_out_tensor = py::cast(ret_tuple[i]); - PADDLE_ENFORCE_NOT_NULL(py_out_tensor, - "Output tensor %d should not be nullptr", i); - auto* tensor = (*outs)[i]->GetMutable(); - tensor->ShareDataWith(*py_out_tensor); - tensor->set_lod(py_out_tensor->lod()); - } catch (py::cast_error&) { - PADDLE_THROW("The %d-th output must be LoDTensor", i); - } - } -} - class PyLayer { public: virtual ~PyLayer() {} static void RegisterFunc(int func_id, const py::object& py_func); + static int NumFuncs(); + static std::vector Apply(int func_id, const std::vector& inputs); - static void ApplyGrad(int func_id, - const std::vector& inputs, - std::vector* outputs); + static std::vector ApplyGrad( + int func_id, const std::vector& inputs); + + private: + static std::vector CallPythonFunc( + const py::object& callable, const std::vector& ins); }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index f6aebea9bb..f68a67e5d7 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -132,8 +132,9 @@ class Tracer { if (!stop_gradient) { framework::OpDesc* grad_op_desc; // TODO(panyx): Is this leaked? - auto grad_to_var = new std::unordered_map(); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); + std::unique_ptr> grad_to_var( + new std::unordered_map()); + CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get()); op->grad_op_desc_ = grad_op_desc; for (auto it : grad_op_desc->Inputs()) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 93dd16c8c9..e52401ced7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -191,7 +191,7 @@ PYBIND11_MODULE(core, m) { return self.Forward(inputs); }); - py::class_(m, "PyLayer") + py::class_(m, "PyLayer") .def(py::init<>()) .def_static( "apply", @@ -200,9 +200,11 @@ PYBIND11_MODULE(core, m) { return imperative::PyLayer::Apply(func_id, inputs); }, py::return_value_policy::take_ownership) - .def_static("register_func", [](int func_id, const py::object &callable) { - imperative::PyLayer::RegisterFunc(func_id, callable); - }); + .def_static("register_func", + [](int func_id, const py::object &callable) { + imperative::PyLayer::RegisterFunc(func_id, callable); + }) + .def_static("num_funcs", &imperative::PyLayer::NumFuncs); BindTracer(&m); diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 2b224b8dbb..8027d9ba3b 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -68,12 +68,15 @@ class PyLayer(core.PyLayer): block = framework.default_main_program().current_block() inputs = [x._ivar for x in inputs] - PyLayer.register_func(1, cls.forward) - PyLayer.register_func(2, cls.backward) + if not hasattr(cls, 'forward_id'): + cls.forward_id = core.PyLayer.num_funcs() + 1 + PyLayer.register_func(cls.forward_id, cls.forward) + cls.backward_id = core.PyLayer.num_funcs() + 1 + PyLayer.register_func(cls.backward_id, cls.backward) iop = core.OpBase() - iop.forward_id = 1 - iop.backward_id = 2 + iop.forward_id = cls.forward_id + iop.backward_id = cls.backward_id block.ops.append(iop) ivars = tracer.py_trace(iop, inputs, False) # ivars = core.PyLayer.apply(cls.forward, inputs) diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 9f93ba9338..e3e1ce7ca3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -81,14 +81,52 @@ class MLP(fluid.imperative.Layer): class TestImperative(unittest.TestCase): - """ def test_layer(self): with fluid.imperative.guard(): cl = core.Layer() cl.forward([]) l = fluid.imperative.Layer() self.assertRaises(NotImplementedError, l.forward, []) - """ + + def test_pylayer_func_id(self): + + with fluid.imperative.guard(): + + class PyLayer1(fluid.imperative.PyLayer): + def __init__(self): + super(PyLayer1, self).__init__() + + @staticmethod + def forward(inputs): + return inputs + + @staticmethod + def backward(inputs): + return inputs + + class PyLayer2(fluid.imperative.PyLayer): + def __init__(self): + super(PyLayer2, self).__init__() + + @staticmethod + def forward(inputs): + return inputs + + @staticmethod + def backward(inputs): + return inputs + + py_layer_1 = PyLayer1() + py_layer_2 = PyLayer2() + py_layer_1([fluid.imperative.base.to_variable(np.ones([2, 2]))]) + py_layer_2([fluid.imperative.base.to_variable(np.ones([2, 2]))]) + id = py_layer_1.forward_id + self.assertGreater(id, 0) + self.assertEqual(py_layer_1.backward_id, id + 1) + self.assertEqual(py_layer_2.forward_id, id + 2) + self.assertEqual(py_layer_2.backward_id, id + 3) + py_layer_1([fluid.imperative.base.to_variable(np.ones([2, 2]))]) + self.assertEqual(py_layer_1.forward_id, id) def test_pylayer(self): np_inp = np.ones([2, 2], np.float32) @@ -118,7 +156,6 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(dy_out, static_out)) self.assertTrue(np.allclose(dy_grad, static_grad)) - """ def test_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): @@ -172,7 +209,6 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(dy_out, static_out)) self.assertTrue(np.allclose(dy_grad, static_grad)) - """ if __name__ == '__main__': From 439691f5bd03337f0d042fd81dca15fc7abcbd8f Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 10 Jan 2019 16:37:43 +0800 Subject: [PATCH 17/22] adjust the shlwapi on windows test=develop --- cmake/external/gflags.cmake | 9 ++++++++ cmake/generic.cmake | 22 +++++++++---------- .../inference/api/demo_ci/CMakeLists.txt | 4 ++-- paddle/fluid/inference/utils/CMakeLists.txt | 3 --- paddle/fluid/platform/float16.h | 2 +- paddle/fluid/pybind/CMakeLists.txt | 5 ++--- 6 files changed, 24 insertions(+), 21 deletions(-) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 4e98e4bf88..95ca16f57f 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -63,6 +63,15 @@ ADD_DEPENDENCIES(gflags extern_gflags) LIST(APPEND external_project_dependencies gflags) +# On Windows (including MinGW), the Shlwapi library is used by gflags if available. 
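+# Recording it in a global property lets every target pick the dependency up at link time.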
+if (WIN32) + include(CheckIncludeFileCXX) + check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI) + if (HAVE_SHLWAPI) + set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib) + endif(HAVE_SHLWAPI) +endif (WIN32) + IF(WITH_C_API) INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags) IF(ANDROID) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 81a910100a..63820fd4f0 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -359,6 +359,8 @@ function(cc_binary TARGET_NAME) add_dependencies(${TARGET_NAME} ${cc_binary_DEPS}) common_link(${TARGET_NAME}) endif() + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${os_dependency_modules}) endfunction(cc_binary) function(cc_test TARGET_NAME) @@ -367,18 +369,15 @@ function(cc_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_test_SRCS}) if(WIN32) - list(APPEND win32_deps shlwapi) if("${cc_test_DEPS};" MATCHES "python;") list(REMOVE_ITEM cc_test_DEPS python) - list(APPEND win32_deps ${PYTHON_LIBRARIES}) + target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) endif() endif(WIN32) - add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) - if(WIN32) - target_link_libraries(${TARGET_NAME} ${win32_deps}) - endif(WIN32) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) common_link(${TARGET_NAME}) add_test(NAME ${TARGET_NAME} @@ -451,10 +450,8 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) - if(WIN32) - target_link_libraries(${TARGET_NAME} shlwapi) - endif(WIN32) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules}) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) @@ -541,7 +538,8 @@ function(hip_test TARGET_NAME) endif() add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) - target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags ${os_dependency_modules}) add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index f42ee9a697..aaf137601c 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt 
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -128,8 +128,8 @@ else() ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash snappystream ${EXTERNAL_LIB}) - # NOTE(dzhwinter) shlwapi is deprecated. - set(DEPS ${DEPS} libcmt shlwapi) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + set(DEPS ${DEPS} libcmt ${os_dependency_modules}) endif(NOT WIN32) if(WITH_GPU) diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index cfb80fe6ec..c43eaf7f98 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -2,6 +2,3 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce) cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_binary(visualizer SRCS visualizer.cc DEPS analysis paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes) -if(WIN32) - target_link_libraries(visualizer shlwapi) -endif(WIN32) diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index 98afe843c0..c203f4e04a 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -59,7 +59,7 @@ limitations under the License. */ #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else -#define PADDLE_ALIGN(x) /*do nothing*/ +#define PADDLE_ALIGN(x) __declspec(align(x)) #endif namespace paddle { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 2545f5312f..ca2764e64f 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -21,9 +21,8 @@ if(WITH_PYTHON) endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(WITH_AMD_GPU) - if(WIN32) - target_link_libraries(paddle_pybind shlwapi) - endif(WIN32) + get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(paddle_pybind ${os_dependency_modules}) cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) endif(WITH_PYTHON) From c3a9f3c4b2c3465f8779cbed4ec8f86479647bde Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 10 Jan 2019 14:06:35 +0000 Subject: [PATCH 18/22] fix typo and refine test=develop --- paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc | 2 +- paddle/fluid/operators/jit/benchmark.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc index 96a3b7ee05..fa75e3b4aa 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -50,7 +50,7 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern, // the other one should be unused empty var. 
if (is_nth_input_var_of_concat(x->outputs[0], idx)) { satisfied_all = satisfied_all && x->outputs[1]->IsVar() && - x->outputs[1]->outputs.size() == 0; + x->outputs[1]->outputs.empty(); } else { satisfied_all = satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) && diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index bde2791add..4b4ce07fa7 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -52,11 +52,11 @@ struct BenchFunc { for (int i = 0; i < FLAGS_burning; ++i) { tgt(args...); } - auto start = paddle::platform::PosixInNsec() / 1e-3; + auto start = paddle::platform::PosixInNsec() * 1e-3; for (int i = 0; i < FLAGS_repeat; ++i) { tgt(args...); } - auto end = paddle::platform::PosixInNsec() / 1e-3; + auto end = paddle::platform::PosixInNsec() * 1e-3; return static_cast(end - start) / FLAGS_repeat; } }; From 3472f6735694bc6c4ccefff4ba1fc2220dd276e0 Mon Sep 17 00:00:00 2001 From: lujun Date: Fri, 11 Jan 2019 10:41:19 +0800 Subject: [PATCH 19/22] fix mnist-dataset bug at windows,test=develop --- python/paddle/dataset/mnist.py | 91 ++++++++++++++++------------------ 1 file changed, 43 insertions(+), 48 deletions(-) diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 38addd0cfd..847ca18720 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -21,10 +21,9 @@ parse training set and test set into paddle reader creators. from __future__ import print_function import paddle.dataset.common -import subprocess +import gzip import numpy -import platform -import tempfile +import struct from six.moves import range __all__ = ['train', 'test', 'convert'] @@ -41,51 +40,47 @@ TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' def reader_creator(image_filename, label_filename, buffer_size): def reader(): - if platform.system() == 'Darwin': - zcat_cmd = 'gzcat' - elif platform.system() == 'Linux': - zcat_cmd = 'zcat' - else: - raise NotImplementedError() - - # According to http://stackoverflow.com/a/38061619/724872, we - # cannot use standard package gzip here. - tmp_image_file = tempfile.TemporaryFile(prefix='paddle_dataset') - m = subprocess.Popen( - [zcat_cmd, image_filename], stdout=tmp_image_file).communicate() - tmp_image_file.seek(16) # skip some magic bytes - - # Python3 will not take stdout as file - tmp_label_file = tempfile.TemporaryFile(prefix='paddle_dataset') - l = subprocess.Popen( - [zcat_cmd, label_filename], stdout=tmp_label_file).communicate() - tmp_label_file.seek(8) # skip some magic bytes - - try: # reader could be break. - while True: - labels = numpy.fromfile( - tmp_label_file, 'ubyte', count=buffer_size).astype("int") - - if labels.size != buffer_size: - break # numpy.fromfile returns empty slice after EOF. 
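The benchmark.cc hunk above fixes a genuine unit bug: PosixInNsec() returns nanoseconds, and dividing by 1e-3 multiplies by 1000, so the old code reported values a million times larger than the intended microseconds; multiplying by 1e-3 converts nanoseconds to microseconds correctly. A minimal sketch of the corrected conversion, using std::chrono as a stand-in for paddle::platform::PosixInNsec:

#include <chrono>
#include <cstdio>

// Returns the current time in microseconds, mirroring the fixed expression
// PosixInNsec() * 1e-3 (nanoseconds -> microseconds).
static double NowInUsec() {
  auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                std::chrono::steady_clock::now().time_since_epoch())
                .count();
  return static_cast<double>(ns) * 1e-3;  // ns -> us
}

int main() {
  const int repeat = 1000;
  double start = NowInUsec();
  volatile long long sink = 0;
  for (int r = 0; r < repeat; ++r) {
    for (int i = 0; i < 1000; ++i) sink += i;  // stands in for tgt(args...)
  }
  double end = NowInUsec();
  std::printf("avg: %f us per repeat\n", (end - start) / repeat);
  return 0;
}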
- - images = numpy.fromfile( - tmp_image_file, 'ubyte', count=buffer_size * 28 * - 28).reshape((buffer_size, 28 * 28)).astype('float32') - - images = images / 255.0 * 2.0 - 1.0 - - for i in range(buffer_size): - yield images[i, :], int(labels[i]) - finally: - try: - m.terminate() - except: - pass - try: - l.terminate() - except: - pass + with gzip.GzipFile(image_filename, 'rb') as image_file: + img_buf = image_file.read() + with gzip.GzipFile(label_filename, 'rb') as label_file: + lab_buf = label_file.read() + + step_label = 0 + + offset_img = 0 + # read from Big-endian + # get file info from magic byte + # image file : 16B + magic_byte_img = '>IIII' + magic_img, image_num, rows, cols = struct.unpack_from( + magic_byte_img, img_buf, offset_img) + offset_img += struct.calcsize(magic_byte_img) + + offset_lab = 0 + # label file : 8B + magic_byte_lab = '>II' + magic_lab, label_num = struct.unpack_from(magic_byte_lab, + lab_buf, offset_lab) + offset_lab += struct.calcsize(magic_byte_lab) + + while True: + if step_label >= label_num: + break + fmt_label = '>' + str(buffer_size) + 'B' + labels = struct.unpack_from(fmt_label, lab_buf, offset_lab) + offset_lab += struct.calcsize(fmt_label) + step_label += buffer_size + + fmt_images = '>' + str(buffer_size * rows * cols) + 'B' + images_temp = struct.unpack_from(fmt_images, img_buf, + offset_img) + images = numpy.reshape(images_temp, ( + buffer_size, rows * cols)).astype('float32') + offset_img += struct.calcsize(fmt_images) + + images = images / 255.0 * 2.0 - 1.0 + for i in range(buffer_size): + yield images[i, :], int(labels[i]) return reader From 358e657f68cfc4a2abc84ca4b1c46480ef09b171 Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 11 Jan 2019 12:45:47 +0800 Subject: [PATCH 20/22] Revert "Remove workspace_handle in conv_cudnn (#15186)" test=develop This reverts commit 064512aa47b9ea35c0b5479b32c1653512c8b7c4. 
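The rewritten MNIST reader above drops the zcat/gzcat subprocess (which does not exist on Windows) and parses the gzip-compressed IDX files directly with gzip + struct, reading the big-endian headers explicitly: 16 bytes (magic, image count, rows, cols) for images and 8 bytes (magic, label count) for labels. A rough C++ rendering of the same header parsing, assuming the bytes are already decompressed (the helper name is illustrative):

#include <cstdint>
#include <cstdio>

// Read a 32-bit big-endian integer, as struct.unpack_from('>I', ...) does.
static uint32_t ReadBE32(const unsigned char* p) {
  return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) |
         (uint32_t(p[2]) << 8) | uint32_t(p[3]);
}

int main() {
  // 16-byte IDX image header: magic, image count, rows, cols (big-endian).
  const unsigned char header[16] = {0, 0, 8, 3,  0, 0, 234, 96,
                                    0, 0, 0, 28, 0, 0, 0,   28};
  uint32_t magic = ReadBE32(header + 0);   // 0x00000803 for ubyte image files
  uint32_t num   = ReadBE32(header + 4);   // 60000 training images
  uint32_t rows  = ReadBE32(header + 8);   // 28
  uint32_t cols  = ReadBE32(header + 12);  // 28
  std::printf("magic=%u num=%u rows=%u cols=%u\n", magic, num, rows, cols);
  return 0;
}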
--- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/operators/conv_cudnn_op.cu.cc | 149 ++++++++---------- paddle/fluid/platform/device_context.cc | 28 ++-- paddle/fluid/platform/device_context.h | 2 +- paddle/fluid/platform/temporary_allocator.cc | 63 ++------ paddle/fluid/platform/temporary_allocator.h | 10 +- .../platform/temporary_allocator_test.cc | 58 +------ python/paddle/fluid/__init__.py | 3 +- 8 files changed, 107 insertions(+), 208 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 041187665a..4d29564aee 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -391,7 +391,7 @@ class ExecutionContext { PADDLE_ENFORCE( dynamic_cast(allocation_ptr) != nullptr, "The AllocationPtr must be TemporaryAllocation."); - PADDLE_ENFORCE_GE(allocation_ptr->size(), + PADDLE_ENFORCE_EQ(allocation_ptr->size(), framework::product(dim) * sizeof(T)); paddle::framework::Tensor temp_tensor( diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index f5208e7a60..25a723fc07 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -137,6 +137,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); bool half_float = false; #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) @@ -157,8 +158,6 @@ class CUDNNConvOpKernel : public framework::OpKernel { VLOG(5) << "NOT use cudnn_tensor_op_math"; } #endif - Tensor cudnn_workspace; - void* cudnn_workspace_ptr = nullptr; auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); @@ -181,26 +180,21 @@ class CUDNNConvOpKernel : public framework::OpKernel { .Var(kCUDNNFwdAlgoCache) ->GetMutable>(); } - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_limit)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - algo = algo_cache->GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array fwd_perf_stat; - - CUDNN_ENFORCE( - platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, - output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - fwd_perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, + output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, + workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { @@ -225,23 +219,17 @@ class CUDNNConvOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - // Allocate on GPU memory - if (!cudnn_workspace_ptr) { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_in_bytes)}), - dev_ctx); - 
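The revert above returns conv_cudnn to dev_ctx.cudnn_workspace_handle(): instead of allocating a temporary tensor and passing a raw cudnn_workspace_ptr around, every cuDNN call that needs scratch space is wrapped in a lambda and handed to RunFunc, so the workspace pointer is only valid inside the callback. A simplified host-side sketch of that handle-and-callback pattern (the real handle manages device memory):

#include <cstddef>
#include <functional>
#include <vector>

// Simplified stand-in for platform::CudnnWorkspaceHandle: it owns the scratch
// buffer and lends it to a callback, so no caller can retain a workspace
// pointer past the call that needed it.
class WorkspaceHandle {
 public:
  void RunFunc(const std::function<void(void*)>& cudnn_func,
               size_t required_workspace_bytes) {
    if (buffer_.size() < required_workspace_bytes) {
      buffer_.resize(required_workspace_bytes);
    }
    cudnn_func(buffer_.data());  // workspace pointer is scoped to the lambda
  }

 private:
  std::vector<unsigned char> buffer_;
};

int main() {
  WorkspaceHandle workspace_handle;
  size_t workspace_size_in_bytes = 1 << 20;
  workspace_handle.RunFunc(
      [&](void* cudnn_workspace) {
        // e.g. cudnnConvolutionForward(..., cudnn_workspace,
        //                              workspace_size_in_bytes, ...);
        (void)cudnn_workspace;
      },
      workspace_size_in_bytes);
  return 0;
}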
cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - } // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_filter_desc, filter_data + i * group_offset_filter, - cudnn_conv_desc, algo, cudnn_workspace_ptr, workspace_size_in_bytes, - &beta, cudnn_output_desc, output_data + i * group_offset_out)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } }; @@ -365,20 +353,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { workspace_size_limit = max_user_size * 1024 * 1024; } - Tensor cudnn_workspace; - void* cudnn_workspace_ptr = nullptr; - if ((input_data || filter_data) && exhaustive_search) { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_limit)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - } - auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); if (exhaustive_search) { @@ -396,22 +374,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ->GetMutable< AlgorithmsCache>(); } - data_algo = data_algo_cache->GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array data_perf_stat; - - CUDNN_ENFORCE(platform::dynload:: - cudnnFindConvolutionBackwardDataAlgorithmEx( - handle, cudnn_filter_desc, filter_data, - cudnn_output_grad_desc, output_grad_data, - cudnn_conv_desc, cudnn_input_desc, - input_grad_data, kNUM_CUDNN_BWD_DATA_ALGS, - &returned_algo_count, data_perf_stat.data(), - cudnn_workspace_ptr, workspace_size_limit)); + auto cudnn_find_bd_data_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload:: + cudnnFindConvolutionBackwardDataAlgorithmEx( + handle, cudnn_filter_desc, filter_data, + cudnn_output_grad_desc, output_grad_data, + cudnn_conv_desc, cudnn_input_desc, input_grad_data, + kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, + data_perf_stat.data(), cudnn_workspace, + workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_bd_data_func, + workspace_size_limit); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { @@ -462,23 +443,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ->GetMutable< AlgorithmsCache>(); } - filter_algo = f_algo_cache->GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array filter_perf_stat; - - CUDNN_ENFORCE( - platform::dynload:: - cudnnFindConvolutionBackwardFilterAlgorithmEx( - handle, cudnn_input_desc, input_data, - cudnn_output_grad_desc, output_grad_data, - cudnn_conv_desc, cudnn_filter_desc, filter_grad_data, - kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - filter_perf_stat.data(), cudnn_workspace_ptr, - 
workspace_size_limit)); + auto cudnn_find_bd_f_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload:: + cudnnFindConvolutionBackwardFilterAlgorithmEx( + handle, cudnn_input_desc, input_data, + cudnn_output_grad_desc, output_grad_data, + cudnn_conv_desc, cudnn_filter_desc, + filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS, + &returned_algo_count, filter_perf_stat.data(), + cudnn_workspace, workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_bd_f_func, + workspace_size_limit); return filter_perf_stat[0].algo; }); VLOG(3) << "cuDNN backward filter algo " << filter_algo; @@ -499,16 +482,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } - // ------------------- cudnn conv workspace --------------------- - if (!cudnn_workspace_ptr) { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_in_bytes)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - } - // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { @@ -516,12 +489,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset input_grad. for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, - filter_data + i * group_offset_filter, cudnn_output_grad_desc, - output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, - cudnn_workspace_ptr, workspace_size_in_bytes, &beta, - cudnn_input_desc, input_grad_data + i * group_offset_in)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + data_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_input_desc, input_grad_data + i * group_offset_in)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } // ------------------- cudnn conv backward filter --------------------- @@ -529,12 +505,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. 
for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_output_grad_desc, output_grad_data + i * group_offset_out, - cudnn_conv_desc, filter_algo, cudnn_workspace_ptr, - workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + i * group_offset_filter)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, + input_data + i * group_offset_in, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + i * group_offset_filter)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 8f80a2d782..09f3d3de54 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -92,24 +92,26 @@ platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( const platform::Place& place, const cudaStream_t& stream) { PADDLE_ENFORCE(platform::is_gpu_place(place)); auto place_stream = std::make_pair(place, stream); - std::unique_lock lock(mtx_); - auto it = device_allocator_.find(place_stream); - if (it == device_allocator_.end()) { - auto tmp_allocator = new TemporaryAllocator(place); - tmp_allocator->SetCallback([stream]() { - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE(cudaGetLastError()); - }); - device_allocator_[place_stream].reset(tmp_allocator); - return *tmp_allocator; - } else { - return *it->second; + { + std::unique_lock lock(mtx_); + if (!device_allocator_.count(place_stream)) { + device_allocator_[place_stream].reset(new TemporaryAllocator(place)); + device_allocator_[place_stream]->SetCallback([stream]() { + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaGetLastError()); + }); + } } + return *device_allocator_.at(place_stream); } template <> platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( const platform::CUDADeviceContext& dev_ctx) { + auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream()); + if (device_allocator_.count(place_stream)) { + return *device_allocator_.at(place_stream); + } return Get(dev_ctx.GetPlace(), dev_ctx.stream()); } #endif @@ -323,7 +325,7 @@ Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { auto& allocator = DeviceTemporaryAllocator::Instance().Get(*this); - allocator.Release([this]() { + allocator.Release([=]() { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); }); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index d376f90ad5..c81d17380c 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -61,7 +61,7 @@ namespace platform { * the allocations of temp_allocation_queue: * - when the Stream calls cudaStreamSynchronize; * - when the allocation size of opportunities exceeds a certain threshold - * (defined by FLAGS_limit_of_tmp_allocation). + * (defined by FLAGS_limit_of_temporary_allocation). 
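As the restored comment above describes, the reverted allocator frees lazily: Free() only moves an allocation onto a pending queue, and the queue is drained once its total size crosses FLAGS_limit_of_temporary_allocation, after the registered callback has synchronized the stream. A condensed sketch of that policy (a model, not the real class):

#include <cstddef>
#include <cstdio>
#include <functional>

// Condensed model of the reverted policy: freed temporaries are queued, and
// the queue is only drained once its byte count crosses the limit flag.
struct TempAllocatorModel {
  long long limit = 1024;  // stands in for FLAGS_limit_of_temporary_allocation
  size_t wait_delete_mem = 0;
  std::function<void()> callback = [] {};  // e.g. cudaStreamSynchronize

  void Free(size_t size) {
    wait_delete_mem += size;  // enqueue only; nothing is released yet
    if (limit > 0 && wait_delete_mem > static_cast<size_t>(limit)) {
      Release();
    }
  }
  void Release() {
    callback();  // make sure the device is done with the memory first
    std::printf("draining %zu queued bytes\n", wait_delete_mem);
    wait_delete_mem = 0;
  }
};

int main() {
  TempAllocatorModel alloc;
  alloc.Free(600);
  alloc.Free(600);  // total 1200 > 1024, so this call triggers the drain
  return 0;
}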
* * */ class DeviceTemporaryAllocator { diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc index 9cbdfe46e7..0be017f75b 100644 --- a/paddle/fluid/platform/temporary_allocator.cc +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -15,15 +15,8 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -DEFINE_int64(limit_of_tmp_allocation, -1, - "The up limit of temporary_allocation size."); -DEFINE_double(times_excess_than_required_tmp_allocation, 2, - "times_excess_than_required_tmp_allocation indicates the " - "max size the TemporaryAllocator can return. For example, " - "if the required memory size is N, and " - "times_excess_than_required_tmp_allocation is 2.0, " - "the TemporaryAllocator will return the available allocation " - "that the range of size is N ~ 2*N."); +DEFINE_double(limit_of_temporary_allocation, -1, + "The up limit of temporary_allocation size."); namespace paddle { namespace platform { @@ -36,25 +29,24 @@ TemporaryAllocation::TemporaryAllocation( underlying_allocation_(std::move(underlying_allocation)) {} TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { - temp_mem_map_.reset(new std::multimap()); + temp_mem_queue_.reset(new std::deque()); } bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } void TemporaryAllocator::Release(const std::function &callback) { - std::unique_ptr> t_allocations; + std::shared_ptr> t_allocations; { std::unique_lock lock(mtx_); callback(); - t_allocations.swap(temp_mem_map_); - temp_mem_map_.reset(new std::multimap()); + t_allocations = temp_mem_queue_; + temp_mem_queue_.reset(new std::deque()); wait_delete_mem_ = 0; } - for (auto tmp : *t_allocations) { - VLOG(10) << "Delete temporary allocation " << tmp.second->ptr() - << " size: " << tmp.second->size(); - delete tmp.second; + VLOG(10) << "Delete temporary allocation " << tmp->ptr() + << " size: " << tmp->size(); + delete tmp; } } @@ -62,34 +54,28 @@ void TemporaryAllocator::Free(alloc::Allocation *allocation) { auto *temp_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(temp_allocation); if (platform::is_gpu_place(temp_allocation->place())) { - PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_), - "The place should be the same."); size_t wait_delete_mem = 0; { std::unique_lock lock(mtx_); - temp_mem_map_->emplace(temp_allocation->size(), temp_allocation); + temp_mem_queue_->emplace_back(temp_allocation); wait_delete_mem_ += temp_allocation->size(); wait_delete_mem = wait_delete_mem_; VLOG(10) << "Move temporary allocation: " << temp_allocation->ptr() << " to delete queue: " << temp_allocation->size() << "; " - << "wait_delete_mem: " << wait_delete_mem; + << "wait_delete_mem: " << wait_delete_mem_; } - - if (FLAGS_limit_of_tmp_allocation > 0 && - wait_delete_mem > static_cast(FLAGS_limit_of_tmp_allocation)) { - PADDLE_ENFORCE(callback_ != nullptr, "The callback is non-initialized."); + if (FLAGS_limit_of_temporary_allocation > 0 && + wait_delete_mem > FLAGS_limit_of_temporary_allocation) { Release(callback_); } return; } - VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr() - << " size: " << temp_allocation->size(); delete temp_allocation; } size_t TemporaryAllocator::TemporaryAllocationQueueSize() { std::unique_lock lock(mtx_); - return temp_mem_map_ ? temp_mem_map_->size() : 0; + return temp_mem_queue_ ? 
temp_mem_queue_->size() : 0; } void TemporaryAllocator::SetCallback(const std::function &callback) { @@ -98,27 +84,6 @@ void TemporaryAllocator::SetCallback(const std::function &callback) { alloc::Allocation *TemporaryAllocator::AllocateImpl( size_t size, alloc::Allocator::Attr attr) { - { - // Find available allocation in temp_mem_map. - std::unique_lock lock(mtx_); - if (temp_mem_map_->size()) { - auto it = temp_mem_map_->lower_bound(size); - // FIXME(zcd): Not sure the best value of excess fraction. - if (it != temp_mem_map_->end() && - it->first < - static_cast( - size * FLAGS_times_excess_than_required_tmp_allocation)) { - auto tmp_ptr = it->second; - temp_mem_map_->erase(it); - wait_delete_mem_ -= tmp_ptr->size(); - VLOG(10) << "Reuse temporary allocation: " << tmp_ptr->ptr() << ": " - << tmp_ptr->size(); - return tmp_ptr; - } - } - } - // If not find the the available allocation, get allocation from - // AllocatorFacadeInstance. auto raw_allocation = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); auto temp_mem = new TemporaryAllocation(std::move(raw_allocation)); diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index d657a14223..812c4a3331 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -15,7 +15,6 @@ #pragma once #include // NOLINT #include -#include #include // NOLINT #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/lock_guard_ptr.h" @@ -40,7 +39,7 @@ class TemporaryAllocation : public memory::allocation::Allocation { * * There is one opportunity to free the allocations of temp_allocation_queue: * - when the allocation size of opportunities exceeds a certain threshold - * (defined by FLAGS_limit_of_tmp_allocation). + * (defined by FLAGS_limit_of_temporary_allocation). * * */ class TemporaryAllocator : public memory::allocation::Allocator { @@ -63,10 +62,11 @@ class TemporaryAllocator : public memory::allocation::Allocator { private: platform::Place place_; + // When the allocation is not held by any variable, it should be placed - // to temp_mem_map immediately. - std::unique_ptr> temp_mem_map_{ - nullptr}; + // to temp_mem_queue immediately. 
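One detail worth noting in the restored Release() above: the pending queue is swapped out while the mutex is held, but the queued allocations are deleted only after the lock is dropped, so slow frees never serialize concurrent callers. A minimal sketch of that lock discipline:

#include <deque>
#include <memory>
#include <mutex>

// Minimal model of Release(): swap the queue under the lock, delete outside.
struct Allocation { char payload[64]; };

class TempQueue {
 public:
  void Push(Allocation* a) {
    std::lock_guard<std::mutex> lock(mtx_);
    queue_->push_back(a);
  }
  void Release() {
    std::shared_ptr<std::deque<Allocation*>> pending;
    {
      std::lock_guard<std::mutex> lock(mtx_);
      pending = queue_;                                      // take ownership
      queue_ = std::make_shared<std::deque<Allocation*>>();  // fresh queue
    }
    // The mutex is no longer held here, so other threads can keep queueing
    // while the old allocations are destroyed.
    for (Allocation* a : *pending) delete a;
  }

 private:
  std::mutex mtx_;
  std::shared_ptr<std::deque<Allocation*>> queue_ =
      std::make_shared<std::deque<Allocation*>>();
};

int main() {
  TempQueue q;
  q.Push(new Allocation);
  q.Release();
  return 0;
}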
+ std::shared_ptr> temp_mem_queue_{nullptr}; + std::mutex mtx_; size_t wait_delete_mem_{0}; std::function callback_; diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index 3879cd5400..35d1d92981 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -18,8 +18,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" -DECLARE_int64(limit_of_tmp_allocation); -DECLARE_double(times_excess_than_required_tmp_allocation); +DECLARE_double(limit_of_temporary_allocation); namespace paddle { namespace platform { @@ -36,7 +35,7 @@ class DummyOp : public framework::OperatorBase { const platform::Place& place) const override {} }; -TEST(temporary_allocator, test_base_function) { +TEST(temporary_allocator, temporary_allocator) { platform::CPUPlace cpu_place; TemporaryAllocator alloc(cpu_place); alloc.Allocate(100); @@ -60,10 +59,10 @@ TEST(temporary_allocator, test_base_function) { #endif } -TEST(temporary_allocator, test_flags_function) { +TEST(temporary_allocator, add_callback) { #ifdef PADDLE_WITH_CUDA - const int64_t limit = FLAGS_limit_of_tmp_allocation; - FLAGS_limit_of_tmp_allocation = 10; + const double limit = FLAGS_limit_of_temporary_allocation; + FLAGS_limit_of_temporary_allocation = 10; platform::CUDAPlace gpu_place(0); TemporaryAllocator gpu_alloc(gpu_place); @@ -79,52 +78,7 @@ TEST(temporary_allocator, test_flags_function) { }); { gpu_alloc.Allocate(100); } PADDLE_ENFORCE(deleted); - FLAGS_limit_of_tmp_allocation = limit; -#endif -} - -TEST(temporary_allocator, test_reuse_tmp_allocation) { -#ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - gpu_alloc.SetCallback([]() {}); - - void* tmp_allocation_ptr1 = nullptr; - { - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - auto tmp_allocation1 = gpu_alloc.Allocate(100); - tmp_allocation_ptr1 = tmp_allocation1->ptr(); - } - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - auto tmp_allocation2 = gpu_alloc.Allocate(100); - void* tmp_allocation_ptr2 = tmp_allocation2->ptr(); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2); - - auto tmp_allocation3 = gpu_alloc.Allocate(100); - void* tmp_allocation_ptr3 = tmp_allocation2->ptr(); - PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr3); -#endif -} - -TEST(temporary_allocator, test_times_excess_than_required_tmp_allocation) { -#ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - gpu_alloc.SetCallback([]() {}); - double excess_fraction = FLAGS_times_excess_than_required_tmp_allocation; - void* tmp_allocation_ptr1 = nullptr; - { - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - auto tmp_allocation1 = - gpu_alloc.Allocate(static_cast(100 * excess_fraction - 1)); - tmp_allocation_ptr1 = tmp_allocation1->ptr(); - } - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - auto tmp_allocation2 = gpu_alloc.Allocate(100); - void* tmp_allocation_ptr2 = tmp_allocation2->ptr(); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2); + FLAGS_limit_of_temporary_allocation = limit; #endif } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 686550a3c8..2c17716500 100644 --- 
a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -155,8 +155,7 @@ def __bootstrap__(): 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', - 'sync_nccl_allreduce', 'limit_of_tmp_allocation', - 'times_excess_than_required_tmp_allocation' + 'sync_nccl_allreduce' ] core.init_gflags([sys.argv[0]] + From c4eced9881bd6a5c4f47739f4a83c422620f11ac Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 11 Jan 2019 12:52:03 +0800 Subject: [PATCH 21/22] fix thread safe bug test=develop --- paddle/fluid/platform/device_context.cc | 28 ++++++++++++------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 09f3d3de54..8f80a2d782 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -92,26 +92,24 @@ platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( const platform::Place& place, const cudaStream_t& stream) { PADDLE_ENFORCE(platform::is_gpu_place(place)); auto place_stream = std::make_pair(place, stream); - { - std::unique_lock lock(mtx_); - if (!device_allocator_.count(place_stream)) { - device_allocator_[place_stream].reset(new TemporaryAllocator(place)); - device_allocator_[place_stream]->SetCallback([stream]() { - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE(cudaGetLastError()); - }); - } + std::unique_lock lock(mtx_); + auto it = device_allocator_.find(place_stream); + if (it == device_allocator_.end()) { + auto tmp_allocator = new TemporaryAllocator(place); + tmp_allocator->SetCallback([stream]() { + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaGetLastError()); + }); + device_allocator_[place_stream].reset(tmp_allocator); + return *tmp_allocator; + } else { + return *it->second; } - return *device_allocator_.at(place_stream); } template <> platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( const platform::CUDADeviceContext& dev_ctx) { - auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream()); - if (device_allocator_.count(place_stream)) { - return *device_allocator_.at(place_stream); - } return Get(dev_ctx.GetPlace(), dev_ctx.stream()); } #endif @@ -325,7 +323,7 @@ Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { auto& allocator = DeviceTemporaryAllocator::Instance().Get(*this); - allocator.Release([=]() { + allocator.Release([this]() { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); }); From 98e85f373578073657ef7c5a3efc4205e5353fce Mon Sep 17 00:00:00 2001 From: Zhaolong Xing Date: Fri, 11 Jan 2019 13:00:35 +0800 Subject: [PATCH 22/22] add_transpose_flatten_concat_fuse (#15121) --- paddle/fluid/framework/ir/CMakeLists.txt | 11 ++ .../framework/ir/graph_pattern_detector.cc | 63 ++++++++ .../framework/ir/graph_pattern_detector.h | 15 ++ .../ir/transpose_flatten_concat_fuse_pass.cc | 148 ++++++++++++++++++ .../ir/transpose_flatten_concat_fuse_pass.h | 38 +++++ paddle/fluid/inference/api/analysis_config.cc | 1 + .../fluid/inference/api/paddle_pass_builder.h | 4 + .../tensorrt/convert/elementwise_op.cc | 37 ++++- 8 files changed, 311 insertions(+), 6 deletions(-) create mode 100644 paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h diff 
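The "fix thread safe bug" patch above is about lock discipline in DeviceTemporaryAllocator::Get: the previous version consulted device_allocator_.count(place_stream) on a fast path without holding mtx_, which races with a concurrent insertion into the map; the fix performs the whole find-or-create under a single critical section. A distilled sketch of the corrected shape (types simplified):

#include <map>
#include <memory>
#include <mutex>
#include <utility>

// Distilled version of the fixed Get(): lookup and insertion share one
// critical section, so no thread ever reads the map unlocked.
class AllocatorRegistry {
 public:
  int& Get(const std::pair<int, int>& place_stream) {
    std::unique_lock<std::mutex> lock(mtx_);
    auto it = table_.find(place_stream);
    if (it == table_.end()) {
      auto* created = new int(0);
      table_[place_stream].reset(created);
      return *created;  // created and published under the same lock
    }
    return *it->second;
  }

 private:
  std::mutex mtx_;
  std::map<std::pair<int, int>, std::unique_ptr<int>> table_;
};

int main() {
  AllocatorRegistry registry;
  registry.Get({0, 1}) = 42;
  return registry.Get({0, 1}) == 42 ? 0 : 1;
}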
--git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a595a8ab42..42fb6a1aa5 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -48,6 +48,17 @@ pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_affine_channel_fuse_pass inference) +pass_library(transpose_flatten_concat_fuse_pass inference) + +# There may be many transpose-flatten structures in a model, and the output of +# these structures will be used as inputs to the concat Op. This pattern will +# be detected by our pass. The index here represents the number of structures in the +# pattern. We use index 3 ~ 6, because these quantities of structures are +# common in the models. +foreach (index RANGE 3 6) + file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n") +endforeach() + if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index c513fe2dd8..6282ced1e4 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1306,6 +1306,69 @@ PDNode *patterns::ConvAffineChannel::operator()( return ac_out_var; } +// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a +// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b +// ... +// z -> transpose_op(n) -> transpose_out_z -> flatten_op(n) -> flatten_out_z +// flatten_out_a -> concat_op flatten_out_b -> concat_op ... flatten_out_z -> +// concat_op +PDNode *patterns::TransposeFlattenConcat::operator()( + std::vector conv_in, int times) { + // The times represents the repeat times of the + // {trans, trans_out, flatten, flatten_out} + const int kNumFields = 4; + const int kTransOutOffset = 1; + const int kFlattenOffset = 2; + const int kFlattenOutOffset = 3; + + std::vector nodes; + + for (int i = 0; i < times; i++) { + nodes.push_back( + pattern->NewNode(GetNodeName("transpose" + std::to_string(i))) + ->assert_is_op("transpose2")); + nodes.push_back( + pattern->NewNode(GetNodeName("transpose_out" + std::to_string(i))) + ->assert_is_op_output("transpose2") + ->assert_is_op_input("flatten2", "X") + ->AsIntermediate()); + nodes.push_back(pattern->NewNode(GetNodeName("flatten" + std::to_string(i))) + ->assert_is_op("flatten2")); + + nodes.push_back( + pattern->NewNode(GetNodeName("flatten_out" + std::to_string(i))) + ->assert_is_op_output("flatten2") + ->assert_is_op_nth_input("concat", "X", i) + ->AsIntermediate()); + } + + auto concat_op = pattern->NewNode(GetNodeName("concat")) + ->assert_is_op("concat") + ->assert_op_has_n_inputs("concat", times); + auto concat_out = pattern->NewNode(GetNodeName("concat_out")) + ->assert_is_op_output("concat") + ->AsOutput(); + + std::vector flatten_outs; + for (int i = 0; i < times; i++) { + conv_in[i]->AsInput(); + // trans + nodes[i * kNumFields]->LinksFrom({conv_in[i]}); + // trans_out + nodes[i * kNumFields + kTransOutOffset]->LinksFrom({nodes[i * kNumFields]}); + // flatten + nodes[i * kNumFields + kFlattenOffset]->LinksFrom( + {nodes[i * kNumFields + kTransOutOffset]}); + // flatten_out + nodes[i * kNumFields + kFlattenOutOffset]->LinksFrom( + {nodes[i * kNumFields + kFlattenOffset]}); + flatten_outs.push_back(nodes[i * 
kNumFields + kFlattenOutOffset]); + } + + concat_op->LinksFrom(flatten_outs).LinksTo({concat_out}); + return concat_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 61a5300344..c8be586f54 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -766,6 +766,21 @@ struct ConvAffineChannel : public PatternBase { PATTERN_DECL_NODE(ac_out); // Out }; +struct TransposeFlattenConcat : public PatternBase { + TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "transpose_flatten_concat") {} + + PDNode* operator()(std::vector conv_inputs, int times); + + std::string GetNodeName(const std::string& op_type) { + return PDNodeName(name_scope_, repr_, id_, op_type); + } + + PDNode* GetPDNode(const std::string& op_type) { + return pattern->RetrieveNode(GetNodeName(op_type)); + } +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc new file mode 100644 index 0000000000..fda43948d5 --- /dev/null +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
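The pattern builder above flattens all repeated transpose/flatten branches into one node vector and addresses each branch's members with fixed offsets, i * kNumFields + offset. A toy sketch of that indexing scheme, using strings in place of graph nodes:

#include <cassert>
#include <string>
#include <vector>

// Each of the `times` branches contributes a fixed-size group of entries to
// one flat vector; member k of branch i lives at i * kNumFields + k.
int main() {
  const int kNumFields = 4;  // trans, trans_out, flatten, flatten_out
  const int kTransOutOffset = 1;
  const int kFlattenOutOffset = 3;
  const int times = 3;

  std::vector<std::string> nodes;
  for (int i = 0; i < times; ++i) {
    nodes.push_back("transpose" + std::to_string(i));
    nodes.push_back("transpose_out" + std::to_string(i));
    nodes.push_back("flatten" + std::to_string(i));
    nodes.push_back("flatten_out" + std::to_string(i));
  }
  // Branch 2's flatten output lives at 2 * kNumFields + kFlattenOutOffset.
  assert(nodes[2 * kNumFields + kFlattenOutOffset] == "flatten_out2");
  assert(nodes[1 * kNumFields + kTransOutOffset] == "transpose_out1");
  return 0;
}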
+ +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +template +std::unique_ptr TransposeFlattenConcatFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = + "transpose_flatten" + std::to_string(times) + "_concat_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + std::vector input_nodes; + for (int i = 0; i < times; i++) { + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(i)) + ->assert_is_op_input("transpose2", "X") + ->AsInput()); + } + + patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(), pattern_name); + pattern(input_nodes, times); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + const int kNumFields = 5; + const int kTransOffset = 1; + const int kTransOutOffset = 2; + const int kFlattenOffset = 3; + const int kFlattenOutOffset = 4; + std::vector nodes; + + for (int i = 0; i < times; i++) { + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i)))); + PADDLE_ENFORCE(subgraph.at(input_nodes[i])); + + nodes.push_back(subgraph.at(input_nodes[i])); + nodes.push_back( + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i)))); + } + + Node *concat_op = subgraph.at(pattern.GetPDNode("concat")); + Node *concat_out = subgraph.at(pattern.GetPDNode("concat_out")); + std::vector input_names; + std::vector trans_axis = boost::get>( + nodes[kTransOffset]->Op()->GetAttr("axis")); + int flatten_axis = + boost::get(nodes[kFlattenOffset]->Op()->GetAttr("axis")); + int concat_axis = boost::get(concat_op->Op()->GetAttr("axis")); + std::string output_name = concat_out->Name(); + + for (int i = 0; i < times; i++) { + input_names.push_back(nodes[i * kNumFields]->Name()); + } + + framework::OpDesc new_op_desc; + new_op_desc.SetType("fusion_transpose_flatten_concat"); + new_op_desc.SetInput("X", input_names); + new_op_desc.SetAttr("trans_axis", trans_axis); + new_op_desc.SetAttr("flatten_axis", flatten_axis); + new_op_desc.SetAttr("concat_axis", concat_axis); + new_op_desc.SetOutput("Out", {output_name}); + new_op_desc.Flush(); + + // Create a new node for the fused op. 
+ auto *new_conv_op = graph->CreateOpNode(&new_op_desc); + + std::unordered_set delete_nodes; + + for (int i = 0; i < times; i++) { + nodes[i * kNumFields]->outputs.push_back(new_conv_op); + new_conv_op->inputs.push_back(nodes[i * kNumFields]); + delete_nodes.insert(nodes[i * kNumFields + kTransOffset]); + delete_nodes.insert(nodes[i * kNumFields + kTransOutOffset]); + delete_nodes.insert(nodes[i * kNumFields + kFlattenOffset]); + delete_nodes.insert(nodes[i * kNumFields + kFlattenOutOffset]); + } + delete_nodes.insert(concat_op); + + new_conv_op->outputs.push_back(concat_out); + concat_out->inputs.push_back(new_conv_op); + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), delete_nodes); + }; + + gpd(graph.get(), handler); + return graph; +} + +template class TransposeFlattenConcatFusePass<1>; +template class TransposeFlattenConcatFusePass<3>; +template class TransposeFlattenConcatFusePass<4>; +template class TransposeFlattenConcatFusePass<5>; +template class TransposeFlattenConcatFusePass<6>; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(transpose_flatten_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<1>); + +REGISTER_PASS(transpose_flatten3_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<3>); + +REGISTER_PASS(transpose_flatten4_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<4>); + +REGISTER_PASS(transpose_flatten5_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<5>); + +REGISTER_PASS(transpose_flatten6_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<6>); diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h new file mode 100644 index 0000000000..fb0f0ae9ef --- /dev/null +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// There may be many transpose-flatten structures in a model, and the output of +// these structures will be used as inputs to the concat Op. This pattern will +// be detected by our pass. The times here represents the repeat times of this +// structure. 
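Because the repeat count is a compile-time template parameter, the pass above is explicitly instantiated once per supported count and each instantiation is registered under its own name (transpose_flatten3_concat_fuse_pass and so on). A toy sketch of that template-on-an-int pattern:

#include <iostream>

// One class template parameterized by the repeat count, explicitly
// instantiated for the counts that are common in real models, then
// registered by name (registration macro omitted here).
template <int kTimes>
class FusePassSketch {
 public:
  void Apply() const {
    std::cout << "fusing " << kTimes << " transpose+flatten branches\n";
  }
};

// Mirrors `template class TransposeFlattenConcatFusePass<3>;` etc.
template class FusePassSketch<3>;
template class FusePassSketch<6>;

int main() {
  FusePassSketch<3>().Apply();
  FusePassSketch<6>().Apply();
  return 0;
}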
+template +class TransposeFlattenConcatFusePass : public FusePassBase { + public: + virtual ~TransposeFlattenConcatFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 211c691504..336ab426c2 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; + Update(); } void contrib::AnalysisConfig::Update() { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 1e5712e163..de9650735a 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -141,6 +141,10 @@ class GpuPassStrategy : public PassStrategy { "conv_elementwise_add_fuse_pass", // }); + for (int i = 6; i >= 3; i--) { + passes_.push_back("transpose_flatten" + std::to_string(i) + + "_concat_fuse_pass"); + } use_gpu_ = true; } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 6975086193..79362f9677 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -39,6 +39,7 @@ class ElementwiseWeightOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
+ nvinfer1::ILayer* layer = nullptr; framework::OpDesc op_desc(op, nullptr); VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; @@ -98,13 +99,21 @@ class ElementwiseWeightOpConverter : public OpConverter { 0}; TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; + if (op_type_ == "add") { + nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *X, scale_mode, shift_weights.get(), + scale_weights.get(), power_weights.get()); + layer = scale_layer; + } else if (op_type_ == "mul") { + nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *X, scale_mode, scale_weights.get(), + shift_weights.get(), power_weights.get()); + layer = scale_layer; + } - nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER( - engine_, Scale, *const_cast(X), scale_mode, - shift_weights.get(), scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Out")[0]; - - layer->setName(("elementwise_add (Output: " + output_name + ")").c_str()); + layer->setName( + ("elementwise_" + op_type_ + "(Output: " + output_name + ")").c_str()); layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); engine_->SetITensor(output_name, layer->getOutput(0)); @@ -113,6 +122,9 @@ class ElementwiseWeightOpConverter : public OpConverter { engine_->DeclareOutput(output_name); } } + + protected: + std::string op_type_; }; class ElementwiseTensorOpConverter : public OpConverter { @@ -188,6 +200,16 @@ const std::unordered_map {"max", nvinfer1::ElementWiseOperation::kMAX}, }; +class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter { + public: + ElementwiseWeightAddOpConverter() { op_type_ = "add"; } +}; + +class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter { + public: + ElementwiseWeightMulOpConverter() { op_type_ = "mul"; } +}; + class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter { public: ElementwiseTensorAddOpConverter() { op_type_ = "add"; } @@ -227,7 +249,10 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter { } // namespace inference } // namespace paddle -REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, + ElementwiseWeightAddOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight, + ElementwiseWeightMulOpConverter); REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, ElementwiseTensorAddOpConverter);
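The converter change above splits the weighted elementwise conversion by op type because TensorRT's IScaleLayer computes out = (in * scale + shift) ^ power: "add" feeds the weight tensor into the shift slot (leaving scale at its identity default), while the newly added "mul" variant feeds it into the scale slot instead. A scalar model of the layer semantics (per-element view only; the real scale modes broadcast per channel or per tensor):

#include <cstdio>

// Scalar model of IScaleLayer: out = (in * scale + shift) ^ power.
// elementwise_add places the weight in the shift slot (scale = 1);
// elementwise_mul places it in the scale slot (shift = 0); power stays 1.
static float ScaleLayer(float in, float scale, float shift) {
  return in * scale + shift;  // power == 1, so the pow() is omitted
}

int main() {
  const float w = 0.5f;  // stands in for the broadcast weight value
  const float x = 2.0f;
  std::printf("elementwise_add -> %.2f\n", ScaleLayer(x, 1.0f, w));  // 2.50
  std::printf("elementwise_mul -> %.2f\n", ScaleLayer(x, w, 0.0f));  // 1.00
  return 0;
}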