From 3e13b9122fef2ac8dc53bad340ad41902f2a4ccd Mon Sep 17 00:00:00 2001
From: caoying03
Date: Tue, 5 Sep 2017 12:18:51 +0800
Subject: [PATCH 01/51] add softmax_with_cost_op.

---
 paddle/operators/softmax_with_cost_op.cc      | 82 +++++++++++++++++++
 paddle/operators/softmax_with_cost_op.h       | 40 +++++++++
 paddle/pybind/pybind.cc                       |  1 +
 .../tests/test_softmax_with_cost_op.py        | 22 +++++
 4 files changed, 145 insertions(+)
 create mode 100644 paddle/operators/softmax_with_cost_op.cc
 create mode 100644 paddle/operators/softmax_with_cost_op.h
 create mode 100644 python/paddle/v2/framework/tests/test_softmax_with_cost_op.py

diff --git a/paddle/operators/softmax_with_cost_op.cc b/paddle/operators/softmax_with_cost_op.cc
new file mode 100644
index 0000000000..a4537691a0
--- /dev/null
+++ b/paddle/operators/softmax_with_cost_op.cc
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/softmax_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SoftmaxWithLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto logits = ctx.Input<Tensor>("logits");
+    PADDLE_ENFORCE(logits->dims().size() == 2UL,
+                   "The input of softmax_with_loss_op should be a 2-d tensor.");
+    PADDLE_ENFORCE(ctx.Input<Tensor>("lables")->dims().size() == 1UL,
+                   "The label should be a 1-d tensor.");
+    ctx.Output<Tensor>("loss")->Resize({logits->dims()[0]});
+  }
+};
+
+class SoftmaxWithLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftmaxWithLossOpMaker(framework::OpProto *proto,
+                         framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("logits",
+             "The unscaled log probabilities which is a 2-D tensor with"
+             "shape [N x K]. N is the batch_size, and K is the class number.");
+    AddInput("label", "The ground truth. A 1-D tensor with shape N.");
+    AddOutput("loss", "A 1-D tensor with shape N.");
+    AddComment(R"DOC(
+Cross entropy loss with softmax are used as the output layer extensively. This
+operator computes the softmax normalized values for each row of the input
+tensor, after which cross-entropy loss is then computed. This provides a more
+numerically stable gradient.
+
+Because this operators performs a softmax on logits internally, it expects
+unscaled logits. Please do not call this op with the output of softmax operator,
+which will produce incorrect results.
+
+This operators expects mutually exclusive hard labels, each sample in a batch
+is in exactly one class with probabilities 1. Each sample in the batch with one
+and only one label.
+)DOC"); + } +}; + +class SoftmaxWithLossOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(softmax, ops::SoftmaxWithLossOp, ops::SoftmaxWithLossOpMaker, + softmax_grad, ops::SoftmaxWithLossOpGrad); +REGISTER_OP_CPU_KERNEL( + softmax, ops::SoftmaxWithLossKernel); +REGISTER_OP_CPU_KERNEL( + softmax_grad, + ops::SoftmaxWithLossGradKernel); diff --git a/paddle/operators/softmax_with_cost_op.h b/paddle/operators/softmax_with_cost_op.h new file mode 100644 index 0000000000..fb544842b7 --- /dev/null +++ b/paddle/operators/softmax_with_cost_op.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SoftmaxWithLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override {} +}; + +template +class SoftmaxWithLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override {} +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 6896422617..e86f4dfe26 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -37,6 +37,7 @@ USE_OP(mul); USE_OP(mean); USE_OP(sigmoid); USE_OP(softmax); +USE_OP(softmax_with_loss); USE_OP(rowwise_add); USE_OP(fill_zeros_like); USE_NO_KERNEL_OP(recurrent); diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cost_op.py b/python/paddle/v2/framework/tests/test_softmax_with_cost_op.py new file mode 100644 index 0000000000..f7b9f54a91 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_softmax_with_cost_op.py @@ -0,0 +1,22 @@ +import unittest + +import numpy as np + +from gradient_checker import GradientChecker, create_op +from op_test_util import OpTestMeta + + +class TestSoftmaxWithLossOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + pass + + +class SoftmaxWithLossGradOpTest(GradientChecker): + def test_softmax(self): + pass + + +if __name__ == '__main__': + unittest.main() From 513bc99702e8d8fd36c34de3aa813c0229442d6b Mon Sep 17 00:00:00 2001 From: caoying03 Date: Fri, 8 Sep 2017 18:49:40 +0800 Subject: [PATCH 02/51] softmax with cross entropy as a cost operator. 
--- paddle/operators/softmax_with_cost_op.cc | 82 -------------- .../softmax_with_cross_entropy_op.cc | 102 ++++++++++++++++++ .../softmax_with_cross_entropy_op.cu | 25 +++++ ...t_op.h => softmax_with_cross_entropy_op.h} | 22 ++-- paddle/pybind/pybind.cc | 2 +- 5 files changed, 139 insertions(+), 94 deletions(-) delete mode 100644 paddle/operators/softmax_with_cost_op.cc create mode 100644 paddle/operators/softmax_with_cross_entropy_op.cc create mode 100644 paddle/operators/softmax_with_cross_entropy_op.cu rename paddle/operators/{softmax_with_cost_op.h => softmax_with_cross_entropy_op.h} (51%) diff --git a/paddle/operators/softmax_with_cost_op.cc b/paddle/operators/softmax_with_cost_op.cc deleted file mode 100644 index a4537691a0..0000000000 --- a/paddle/operators/softmax_with_cost_op.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -:A -limitations under the License. */ - -#include "paddle/operators/softmax_op.h" - -namespace paddle { -namespace operators { - -class SoftmaxWithLossOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - auto logits = ctx.Input("logits"); - PADDLE_ENFORCE(logits->dims().size() == 2UL, - "The input of softmax_with_loss_op should be a 2-d tensor."); - PADDLE_ENFORCE(ctx.Input("lables")->dims().size() == 1UL, - "The label should be a 1-d tensor."); - ctx.Output("loss")->Resize({logits->dims()[0]}); - } -}; - -class SoftmaxWithLossOpMaker : public framework::OpProtoAndCheckerMaker { - public: - SoftmaxWithLossOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("logits", - "The unscaled log probabilities which is a 2-D tensor with" - "shape [N x K]. N is the batch_size, and K is the class number."); - AddInput("label", "The ground truth. A 1-D tensor with shape N."); - AddOutput("loss", "A 1-D tensor with shape N."); - AddComment(R"DOC( -Cross entropy loss with softmax are used as the output layer extensively. This -operator computes the softmax normalized values for each row of the input -tensor, after which cross-entropy loss is then computed. This provides a more -numerically stable gradient. - -Because this operators performs a softmax on logits internally, it expects -unscaled logits. Please do not call this op with the output of softmax operator, -which will produce incorrect results. - -This operators expects mutually exclusive hard labels, each sample in a batch -is in exactly one class with probabilities 1. Each sample in the batch with one -and only one label. 
-)DOC"); - } -}; - -class SoftmaxWithLossOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override {} -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP(softmax, ops::SoftmaxWithLossOp, ops::SoftmaxWithLossOpMaker, - softmax_grad, ops::SoftmaxWithLossOpGrad); -REGISTER_OP_CPU_KERNEL( - softmax, ops::SoftmaxWithLossKernel); -REGISTER_OP_CPU_KERNEL( - softmax_grad, - ops::SoftmaxWithLossGradKernel); diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc new file mode 100644 index 0000000000..2edf00766e --- /dev/null +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/softmax_with_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto logits = ctx.Input("logits"); + PADDLE_ENFORCE( + logits->dims().size() == 2UL, + "The input of softmax_with_cross_entropy should be a 2-d tensor."); + PADDLE_ENFORCE(ctx.Input("lables")->dims().size() == 1UL, + "The label should be a 1-d tensor."); + ctx.Output("Y")->Resize({logits->dims()[0]}); + } +}; + +class SoftmaxWithCrossEntropyOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + SoftmaxWithCrossEntropyOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("logits", + "The unscaled log probabilities which is a 2-D tensor with" + "shape [N x K]. N is the batch_size, and K is the class number."); + AddInput("label", "The ground truth. A 1-D tensor with shape N."); + AddOutput("Y", "A 1-D tensor with shape N."); + AddComment(R"DOC( +Cross entropy loss with softmax are used as the output layer extensively. This +operator computes the softmax normalized values for each row of the input +tensor, after which cross-entropy loss is then computed. This provides a more +numerically stable gradient. + +Because this operators performs a softmax on logits internally, it expects +unscaled logits. Please do not call this op with the output of softmax operator, +which will produce incorrect results. + +This operators expects mutually exclusive hard labels, each sample in a batch +is in exactly one class with probabilities 1. Each sample in the batch with one +and only one label. 
+)DOC"); + } +}; + +class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should be not null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")), + "Input(Y@GRAD) should be not null."); + PADDLE_ENFORCE_EQ(ctx.Input("Y")->dims(), + ctx.Input(framework::GradVarName("Y"))->dims(), + "Input(Y) and its gradients should have a same shape."); + + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("labels"), + "Input(lables) should be not null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("logits")), + "Input(logits@GRAD) should be not null."); + PADDLE_ENFORCE_EQ( + ctx.Input("logits")->dims(), + ctx.Input(framework::GradVarName("logits"))->dims(), + "Input(logits) and its gradients should have a same shape."); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp, + ops::SoftmaxWithCrossEntropyOpMaker, + softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyOpGrad); +REGISTER_OP_CPU_KERNEL( + softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyKernel); +REGISTER_OP_CPU_KERNEL( + softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradKernel); diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu new file mode 100644 index 0000000000..c9d47cc4aa --- /dev/null +++ b/paddle/operators/softmax_with_cross_entropy_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "softmax_with_cross_entropy_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_GPU_KERNEL( + softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyKernel); +REGISTER_OP_GPU_KERNEL( + softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradKernel); diff --git a/paddle/operators/softmax_with_cost_op.h b/paddle/operators/softmax_with_cross_entropy_op.h similarity index 51% rename from paddle/operators/softmax_with_cost_op.h rename to paddle/operators/softmax_with_cross_entropy_op.h index fb544842b7..418fb540b8 100644 --- a/paddle/operators/softmax_with_cost_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ #pragma once #include "paddle/framework/eigen.h" @@ -25,13 +25,13 @@ template ; template -class SoftmaxWithLossKernel : public framework::OpKernel { +class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override {} }; template -class SoftmaxWithLossGradKernel : public framework::OpKernel { +class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override {} }; diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 7d363b3108..a059cd0b81 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -37,7 +37,7 @@ USE_OP(mul); USE_OP(mean); USE_OP(sigmoid); USE_OP(softmax); -USE_OP(softmax_with_loss); +USE_OP(softmax_with_cross_entropy); USE_OP(rowwise_add); USE_OP(fill_zeros_like); USE_NO_KERNEL_OP(recurrent); From c0cef849b6971657b3d3396578e18824ec926e15 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Tue, 12 Sep 2017 00:17:11 +0800 Subject: [PATCH 03/51] softmax as function. 
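
This commit lifts the Eigen-based softmax computation out of SoftmaxKernel
into a shared math::softmax helper so that softmax_op and, in the following
commits, the softmax_with_cross_entropy kernels can reuse one implementation.
The helper keeps the usual max-shift trick: subtracting each row's maximum
before exponentiating does not change the softmax value but keeps exp() in a
safe range. A minimal NumPy sketch of the same computation (stable_softmax
here mirrors the helper used by the Python unit tests and is illustrative,
not part of this patch):

    import numpy as np

    def stable_softmax(x):
        """Row-wise softmax computed with the max-shift trick."""
        shifted = x - x.max(axis=-1, keepdims=True)  # largest entry becomes 0
        exps = np.exp(shifted)
        return exps / exps.sum(axis=-1, keepdims=True)

    logits = np.array([[1.0, 2.0, 3.0], [1000.0, 1001.0, 1002.0]])
    print(stable_softmax(logits))  # both rows stay finite and sum to 1

Without the shift, exp(1000.0) overflows to inf in float64 and the second
row would evaluate to NaN.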
--- paddle/operators/CMakeLists.txt | 4 +- paddle/operators/math/CMakeLists.txt | 8 +-- paddle/operators/math/softmax_function.cc | 63 +++++++++++++++++++++++ paddle/operators/math/softmax_function.h | 29 +++++++++++ paddle/operators/softmax_op.h | 32 ++---------- 5 files changed, 103 insertions(+), 33 deletions(-) create mode 100644 paddle/operators/math/softmax_function.cc create mode 100644 paddle/operators/math/softmax_function.h diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index f9ea25ab04..94e00ac382 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -55,10 +55,12 @@ set(DEPS_OPS minus_op mul_op recurrent_op - scale_op) + scale_op + softmax_op) op_library(identity_op DEPS scale_op) op_library(minus_op DEPS scale_op) op_library(mul_op DEPS math_function) +op_library(softmax_op DEPS math_function) op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc DEPS framework_proto tensor operator net_op) op_library(scale_op DEPS net_op) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index f8333f34f7..8ce39db621 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -1,9 +1,9 @@ - if(WITH_GPU) - nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc - im2col.cu DEPS cblas device_context) + nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc + im2col.cu softmax_function.cc DEPS cblas device_context operator) else() - cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context) + cc_library(math_function SRCS math_function.cc im2col.cc + softmax_function.cc DEPS cblas device_context operator) endif() nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/softmax_function.cc b/paddle/operators/math/softmax_function.cc new file mode 100644 index 0000000000..7edb632d31 --- /dev/null +++ b/paddle/operators/math/softmax_function.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef PADDLE_ONLY_CPU +#define EIGEN_USE_GPU +#endif + +#include "paddle/operators/math/softmax_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +using EigenMatrix = framework::EigenMatrix; + +template +void softmax(const framework::Tensor* X, framework::Tensor* Y, + const framework::ExecutionContext& context) { + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto shifted_logits = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + + softmax.device(context.GetEigenDevice()) = shifted_logits.exp(); + softmax.device(context.GetEigenDevice()) = + (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/softmax_function.h b/paddle/operators/math/softmax_function.h new file mode 100644 index 0000000000..2e1b2a7ad0 --- /dev/null +++ b/paddle/operators/math/softmax_function.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +void softmax(const framework::Tensor* X, framework::Tensor* Y, + const framework::ExecutionContext& context); +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 8a3a5ab927..ff054a59ae 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/softmax_function.h" namespace paddle { namespace operators { @@ -30,36 +31,11 @@ class SoftmaxKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto X = context.Input("X"); auto Y = context.Output("Y"); - Y->mutable_data(context.GetPlace()); - - auto logits = EigenMatrix::From(*X); - auto softmax = EigenMatrix::From(*Y); - - const int kBatchDim = 0; - const int kClassDim = 1; - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); + // allocate memory on device. 
+ Y->mutable_data(context.GetPlace()); - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - - auto shifted_logits = (logits - - logits.maximum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); - - softmax.device(context.GetEigenDevice()) = shifted_logits.exp(); - - softmax.device(context.GetEigenDevice()) = - (softmax * - softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + math::softmax(X, Y, context); } }; From c6366c819e668c21a822122086ad72008357dd66 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Tue, 12 Sep 2017 16:38:11 +0800 Subject: [PATCH 04/51] softmax as functor. --- paddle/operators/CMakeLists.txt | 2 +- paddle/operators/cross_entropy_op.h | 28 ++------- paddle/operators/math/CMakeLists.txt | 7 ++- paddle/operators/math/softmax_function.cc | 58 ++++--------------- paddle/operators/math/softmax_function.cu | 27 +++++++++ paddle/operators/math/softmax_function.h | 57 ++++++++++++++---- paddle/operators/softmax_op.h | 2 +- .../softmax_with_cross_entropy_op.cc | 44 +++++++------- .../operators/softmax_with_cross_entropy_op.h | 27 ++++++++- .../framework/tests/test_cross_entropy_op.py | 13 +++-- .../tests/test_softmax_with_cost_op.py | 22 ------- .../test_softmax_with_cross_entropy_op.py | 39 +++++++++++++ 12 files changed, 192 insertions(+), 134 deletions(-) create mode 100644 paddle/operators/math/softmax_function.cu delete mode 100644 python/paddle/v2/framework/tests/test_softmax_with_cost_op.py create mode 100644 python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 94e00ac382..8863ffe8e3 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -60,7 +60,7 @@ set(DEPS_OPS op_library(identity_op DEPS scale_op) op_library(minus_op DEPS scale_op) op_library(mul_op DEPS math_function) -op_library(softmax_op DEPS math_function) +op_library(softmax_op DEPS softmax_function) op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc DEPS framework_proto tensor operator net_op) op_library(scale_op DEPS net_op) diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index eb4d1348de..6de23bbe00 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -14,31 +14,13 @@ limitations under the License. 
*/ #pragma once #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/utils.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -inline T tolerable_value(const T x) { - static_assert(std::is_floating_point::value, - "tolerable_value works only on float, " - "double and double double."); - - const T kApproInf = 1e20; - - if (x == INFINITY) { - return kApproInf; - } - - if (x == -INFINITY) { - return -kApproInf; - } - - return x; -} - template class OnehotCrossEntropyOpKernel : public framework::OpKernel { public: @@ -55,12 +37,12 @@ class OnehotCrossEntropyOpKernel : public framework::OpKernel { T* Ydata = Y->data(); - int batch_size = X->dims()[0]; - int class_num = X->dims()[1]; + const int batch_size = X->dims()[0]; + const int class_num = X->dims()[1]; for (int i = 0; i < batch_size; ++i) { int index = i * class_num + label_data[i]; - Ydata[i] = -tolerable_value(std::log(Xdata[index])); + Ydata[i] = -math::tolerable_value(std::log(Xdata[index])); } } }; @@ -89,7 +71,7 @@ class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel { memset(dXdata, 0, sizeof(T) * batch_size * class_num); for (int i = 0; i < batch_size; ++i) { int index = i * class_num + label_data[i]; - dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]); + dXdata[index] = -math::tolerable_value(dYdata[i] / Xdata[index]); } } }; diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 8ce39db621..832a954e3a 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -1,9 +1,12 @@ if(WITH_GPU) nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc - im2col.cu softmax_function.cc DEPS cblas device_context operator) + im2col.cu DEPS cblas device_context operator) + nv_library(softmax_function SRCS softmax_function.cc softmax_function.cu + DEPS operator) else() cc_library(math_function SRCS math_function.cc im2col.cc - softmax_function.cc DEPS cblas device_context operator) + DEPS cblas device_context operator) + cc_library(softmax_function SRCS softmax_function.cc DEPS operator) endif() nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/softmax_function.cc b/paddle/operators/math/softmax_function.cc index 7edb632d31..cd46ed96ca 100644 --- a/paddle/operators/math/softmax_function.cc +++ b/paddle/operators/math/softmax_function.cc @@ -1,20 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef PADDLE_ONLY_CPU -#define EIGEN_USE_GPU -#endif + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ #include "paddle/operators/math/softmax_function.h" @@ -22,41 +18,7 @@ namespace paddle { namespace operators { namespace math { -template -using EigenMatrix = framework::EigenMatrix; - -template -void softmax(const framework::Tensor* X, framework::Tensor* Y, - const framework::ExecutionContext& context) { - auto logits = EigenMatrix::From(*X); - auto softmax = EigenMatrix::From(*Y); - - const int kBatchDim = 0; - const int kClassDim = 1; - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - - auto shifted_logits = (logits - - logits.maximum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); - - softmax.device(context.GetEigenDevice()) = shifted_logits.exp(); - softmax.device(context.GetEigenDevice()) = - (softmax * - softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); -} +template class SoftmaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax_function.cu b/paddle/operators/math/softmax_function.cu new file mode 100644 index 0000000000..486697a161 --- /dev/null +++ b/paddle/operators/math/softmax_function.cu @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/operators/math/softmax_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template class SoftmaxFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/softmax_function.h b/paddle/operators/math/softmax_function.h index 2e1b2a7ad0..ce29a69bce 100644 --- a/paddle/operators/math/softmax_function.h +++ b/paddle/operators/math/softmax_function.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ #pragma once #include "paddle/framework/eigen.h" @@ -21,9 +21,44 @@ namespace paddle { namespace operators { namespace math { +template +using EigenMatrix = framework::EigenMatrix; + template -void softmax(const framework::Tensor* X, framework::Tensor* Y, - const framework::ExecutionContext& context); +class SoftmaxFunctor { + public: + void operator()(const framework::Tensor* X, framework::Tensor* Y, + const framework::ExecutionContext& context) { + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto shifted_logits = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + + softmax.device(context.GetEigenDevice()) = shifted_logits.exp(); + softmax.device(context.GetEigenDevice()) = + (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + } +}; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index ff054a59ae..6d14542a72 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -35,7 +35,7 @@ class SoftmaxKernel : public framework::OpKernel { // allocate memory on device. Y->mutable_data(context.GetPlace()); - math::softmax(X, Y, context); + math::SoftmaxFunctor()(X, Y, context); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index 2edf00766e..b4aa9aab4b 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -23,13 +23,13 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - auto logits = ctx.Input("logits"); + auto logits = ctx.Input("Logits"); PADDLE_ENFORCE( logits->dims().size() == 2UL, "The input of softmax_with_cross_entropy should be a 2-d tensor."); - PADDLE_ENFORCE(ctx.Input("lables")->dims().size() == 1UL, + PADDLE_ENFORCE(ctx.Input("Label")->dims().size() == 1UL, "The label should be a 1-d tensor."); - ctx.Output("Y")->Resize({logits->dims()[0]}); + ctx.Output("Label")->Resize({logits->dims()[0]}); } }; @@ -39,11 +39,15 @@ class SoftmaxWithCrossEntropyOpMaker SoftmaxWithCrossEntropyOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("logits", + AddInput("Logits", "The unscaled log probabilities which is a 2-D tensor with" "shape [N x K]. N is the batch_size, and K is the class number."); - AddInput("label", "The ground truth. A 1-D tensor with shape N."); - AddOutput("Y", "A 1-D tensor with shape N."); + AddInput("Label", "The ground truth. 
A 1-D tensor with shape N."); + AddOutput("Softmax", + "Store the outputs of softmax function, " + "which will be used in backward calculation.") + .AsIntermediate(); + AddOutput("Loss", "A 1-D tensor with shape N."); AddComment(R"DOC( Cross entropy loss with softmax are used as the output layer extensively. This operator computes the softmax normalized values for each row of the input @@ -67,21 +71,21 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should be not null."); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")), - "Input(Y@GRAD) should be not null."); - PADDLE_ENFORCE_EQ(ctx.Input("Y")->dims(), - ctx.Input(framework::GradVarName("Y"))->dims(), - "Input(Y) and its gradients should have a same shape."); - - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("labels"), - "Input(lables) should be not null."); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("logits")), - "Input(logits@GRAD) should be not null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Loss"), + "Input(Loss) should be not null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should be not null."); PADDLE_ENFORCE_EQ( - ctx.Input("logits")->dims(), - ctx.Input(framework::GradVarName("logits"))->dims(), - "Input(logits) and its gradients should have a same shape."); + ctx.Input("Logits")->dims(), + ctx.Input(framework::GradVarName("Logits"))->dims(), + "Input(Logits) and its gradients should have a same shape."); + PADDLE_ENFORCE_EQ( + ctx.Input("Logits")->dims(), + ctx.Input(framework::GradVarName("Logits"))->dims(), + "Input(Logits) and its gradients should have a same shape."); + + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), + "Input(Lable) should be not null."); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h index 418fb540b8..4c019a7599 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -15,6 +15,8 @@ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/softmax_function.h" +#include "paddle/operators/math/utils.h" namespace paddle { namespace operators { @@ -27,7 +29,30 @@ using EigenMatrix = framework::EigenMatrix; template class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override {} + void Compute(const framework::ExecutionContext& context) const override { + // Calculate ths softmax outputs. + const Tensor* logits = context.Input("Logits"); + Tensor* softmax = context.Output("Softmax"); + // allocate memory on device. + softmax->mutable_data(context.GetPlace()); + math::SoftmaxFunctor()(logits, softmax, context); + + // Calculate the cross entropy loss based on hard labels. 
+ T* softmax_out = softmax->data(); + const int* label_data = context.Input("label")->data(); + + Tensor* loss = context.Output("Loss"); + loss->mutable_data(context.GetPlace()); + T* loss_data = loss->data(); + + const int batch_size = logits->dims()[0]; + const int class_num = logits->dims()[1]; + + for (int i = 0; i < batch_size; ++i) { + int index = i * class_num + label_data[i]; + loss_data[i] = -math::tolerable_value(std::log(softmax_out[index])); + } + } }; template diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index c2fc102a8b..6c1dc4044f 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -1,6 +1,7 @@ import unittest import numpy from op_test import OpTest +import pdb class TestCrossEntropy(OpTest): @@ -10,18 +11,20 @@ class TestCrossEntropy(OpTest): class_num = 10 X = numpy.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") - label = (class_num / 2) * numpy.ones(batch_size).astype("int32") - self.inputs = {'X': X, 'label': label} + + labels = numpy.random.randint(0, class_num, batch_size, dtype="int32") + + self.inputs = {"X": X, "label": labels} Y = [] for i in range(0, batch_size): - Y.append(-numpy.log(X[i][label[i]])) - self.outputs = {'Y': numpy.array(Y).astype("float32")} + Y.append(-numpy.log(X[i][labels[i]])) + self.outputs = {"Y": numpy.array(Y).astype("float32")} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y') + self.check_grad(["X"], "Y") if __name__ == "__main__": diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cost_op.py b/python/paddle/v2/framework/tests/test_softmax_with_cost_op.py deleted file mode 100644 index f7b9f54a91..0000000000 --- a/python/paddle/v2/framework/tests/test_softmax_with_cost_op.py +++ /dev/null @@ -1,22 +0,0 @@ -import unittest - -import numpy as np - -from gradient_checker import GradientChecker, create_op -from op_test_util import OpTestMeta - - -class TestSoftmaxWithLossOp(unittest.TestCase): - __metaclass__ = OpTestMeta - - def setUp(self): - pass - - -class SoftmaxWithLossGradOpTest(GradientChecker): - def test_softmax(self): - pass - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py new file mode 100644 index 0000000000..6116110569 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py @@ -0,0 +1,39 @@ +import unittest +import numpy as np +import pdb + +from op_test import OpTest +from test_softmax_op import stable_softmax + + +class TestSoftmaxWithCrossEntropyOp(OpTest): + def setUp(self): + self.op_type = "softmax_with_cross_entropy" + + MAX_BATCH_SIZE = 23 + MAX_CLASS_NUM = 255 + + batch_size = np.random.randint(1, MAX_BATCH_SIZE, 1)[0] + class_num = np.random.randint(2, MAX_CLASS_NUM, 1)[0] + + logits = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float32") + softmax = np.apply_along_axis(stable_softmax, 1, logits) + labels = np.random.randint(0, class_num, batch_size, dtype="int32") + + cross_entropy = [ + -np.log(softmax[i][labels[i]]) for i in range(softmax.shape[0]) + ] + + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = {"Loss": cross_entropy} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + pass + + +if 
__name__ == "__main__": + unittest.main() From efa4526c52fbd80f0f0c9f135f1aabf438cbcf69 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Wed, 13 Sep 2017 18:01:12 +0800 Subject: [PATCH 05/51] finish implementation and fix unittest. --- paddle/operators/softmax_op.h | 2 - .../softmax_with_cross_entropy_op.cc | 71 +++++++++---------- .../softmax_with_cross_entropy_op.cu | 7 +- .../operators/softmax_with_cross_entropy_op.h | 30 ++++++-- paddle/pybind/pybind.cc | 2 +- python/paddle/v2/framework/tests/op_test.py | 22 +++--- .../test_softmax_with_cross_entropy_op.py | 12 ++-- 7 files changed, 77 insertions(+), 69 deletions(-) diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 6d14542a72..68d05fc215 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -43,8 +43,6 @@ template class SoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - std::shared_ptr scale_ = std::make_shared(); - auto Y = context.Input("Y"); auto dY = context.Input(framework::GradVarName("Y")); auto dX = context.Output(framework::GradVarName("X")); diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index b4aa9aab4b..fd75494ff8 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -17,31 +17,16 @@ namespace paddle { namespace operators { -class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - auto logits = ctx.Input("Logits"); - PADDLE_ENFORCE( - logits->dims().size() == 2UL, - "The input of softmax_with_cross_entropy should be a 2-d tensor."); - PADDLE_ENFORCE(ctx.Input("Label")->dims().size() == 1UL, - "The label should be a 1-d tensor."); - ctx.Output("Label")->Resize({logits->dims()[0]}); - } -}; - class SoftmaxWithCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftmaxWithCrossEntropyOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + SoftmaxWithCrossEntropyOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Logits", "The unscaled log probabilities which is a 2-D tensor with" - "shape [N x K]. N is the batch_size, and K is the class number."); + "shape [N x K]. N is the batch_size, and K is the class number.") + .NotInGradient(); AddInput("Label", "The ground truth. 
A 1-D tensor with shape N."); AddOutput("Softmax", "Store the outputs of softmax function, " @@ -70,22 +55,34 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Loss"), - "Input(Loss) should be not null."); + void InferShape(const framework::InferShapeContext& ctx) const override { PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Loss")), - "Input(Loss@GRAD) should be not null."); - PADDLE_ENFORCE_EQ( - ctx.Input("Logits")->dims(), - ctx.Input(framework::GradVarName("Logits"))->dims(), - "Input(Logits) and its gradients should have a same shape."); - PADDLE_ENFORCE_EQ( - ctx.Input("Logits")->dims(), - ctx.Input(framework::GradVarName("Logits"))->dims(), - "Input(Logits) and its gradients should have a same shape."); - + "Input(Loss@Grad) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Softmax"), + "Input(Softmax) should be not null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), "Input(Lable) should be not null."); + + ctx.Output(framework::GradVarName("Logits")) + ->Resize(ctx.Input("Softmax")->dims()); + } +}; + +class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext& ctx) const override { + const Tensor* logits = ctx.Input("Logits"); + PADDLE_ENFORCE( + logits->dims().size() == 2UL, + "The input of softmax_with_cross_entropy should be a 2-d tensor."); + PADDLE_ENFORCE(ctx.Input("Label")->dims().size() == 1UL, + "The label should be a 1-d tensor."); + + ctx.Output("Softmax")->Resize(logits->dims()); + ctx.Output("Loss")->Resize({logits->dims()[0], 1}); } }; @@ -98,9 +95,7 @@ REGISTER_OP(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp, ops::SoftmaxWithCrossEntropyOpMaker, softmax_with_cross_entropy_grad, ops::SoftmaxWithCrossEntropyOpGrad); -REGISTER_OP_CPU_KERNEL( - softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyKernel); -REGISTER_OP_CPU_KERNEL( - softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradKernel); +REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyKernel); +REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradKernel); diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu index c9d47cc4aa..922bb19d4d 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/operators/softmax_with_cross_entropy_op.cu @@ -17,9 +17,4 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyKernel); -REGISTER_OP_GPU_KERNEL( - softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradKernel); +// TODO(caoying) add GPU kernel diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h index 4c019a7599..e147cdb815 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -26,20 +26,24 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto place = context.GetPlace(); + 
PADDLE_ENFORCE(platform::is_cpu_place(place), + "This kernel only runs on CPU."); + // Calculate ths softmax outputs. const Tensor* logits = context.Input("Logits"); Tensor* softmax = context.Output("Softmax"); - // allocate memory on device. softmax->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(logits, softmax, context); + + math::SoftmaxFunctor()(logits, softmax, context); // Calculate the cross entropy loss based on hard labels. T* softmax_out = softmax->data(); - const int* label_data = context.Input("label")->data(); + const int* label_data = context.Input("Label")->data(); Tensor* loss = context.Output("Loss"); loss->mutable_data(context.GetPlace()); @@ -55,10 +59,24 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { } }; -template +template class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override {} + void Compute(const framework::ExecutionContext& context) const override { + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + logit_grad->ShareDataWith(*context.Input("Softmax")); + T* logit_grad_data = logit_grad->data(); + + const int batch_size = logit_grad->dims()[0]; + const int class_num = logit_grad->dims()[1]; + + const int* label_data = context.Input("Label")->data(); + for (int i = 0; i < batch_size; ++i) { + int index = i * class_num + label_data[i]; + logit_grad_data[index] -= .1; + } + } }; } // namespace operators diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 93792c568e..cb361596ae 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -39,7 +39,6 @@ USE_OP(elementwise_mul); USE_OP(mean); USE_OP(sigmoid); USE_OP(softmax); -USE_OP(softmax_with_cross_entropy); USE_OP(rowwise_add); USE_OP(fill_zeros_like); USE_NO_KERNEL_OP(recurrent); @@ -53,6 +52,7 @@ USE_OP(cos_sim); USE_CPU_ONLY_OP(gather); USE_CPU_ONLY_OP(scatter); USE_CPU_ONLY_OP(concat); +USE_CPU_ONLY_OP(softmax_with_cross_entropy); USE_OP(top_k); USE_OP(squared_l2_distance); USE_OP(sum); diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py index 4fec4c9109..f5f11aa93d 100644 --- a/python/paddle/v2/framework/tests/op_test.py +++ b/python/paddle/v2/framework/tests/op_test.py @@ -166,7 +166,7 @@ def get_gradient(scope, op, inputs, outputs, grad_name, place, class OpTest(unittest.TestCase): - def check_output_with_place(self, place): + def check_output_with_place(self, place, atol): self.scope = core.Scope() op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_attrs = self.attrs if hasattr(self, "attrs") else dict() @@ -188,22 +188,23 @@ class OpTest(unittest.TestCase): expect = sub_out[sub_out_name] self.assertTrue( np.allclose( - actual, expect, atol=1e-05), - "output name: " + out_name + "has diff") + actual, expect, atol=atol), + "output name: " + out_name + " has diff.") else: actual = np.array(self.scope.find_var(out_name).get_tensor()) expect = self.outputs[out_name] + self.assertTrue( np.allclose( - actual, expect, atol=1e-05), - "output name: " + out_name + "has diff") + actual, expect, atol=atol), + "output name: " + out_name + " has diff.") - def check_output(self): + def check_output(self, atol=1e-5): places = [core.CPUPlace()] if core.is_compile_gpu(): places.append(core.GPUPlace(0)) for place in places: - self.check_output_with_place(place) + self.check_output_with_place(place, atol) def __assert_is_close(self, numeric_grads, analytic_grads, names, 
max_relative_error, msg_prefix): @@ -217,9 +218,10 @@ class OpTest(unittest.TestCase): def err_msg(): offset = np.argmax(diff_mat > max_relative_error) - return "%s Variable %s max gradient diff %f over limit %f, the first " \ - "error element is %d" % ( - msg_prefix, name, max_diff, max_relative_error, offset) + return ("%s Variable %s max gradient diff %f over limit %f, " + "the first error element is %d") % ( + msg_prefix, name, max_diff, max_relative_error, + offset) self.assertLessEqual(max_diff, max_relative_error, err_msg()) diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py index 6116110569..4e35c063b9 100644 --- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py @@ -11,7 +11,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): self.op_type = "softmax_with_cross_entropy" MAX_BATCH_SIZE = 23 - MAX_CLASS_NUM = 255 + MAX_CLASS_NUM = 10 batch_size = np.random.randint(1, MAX_BATCH_SIZE, 1)[0] class_num = np.random.randint(2, MAX_CLASS_NUM, 1)[0] @@ -21,18 +21,18 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): softmax = np.apply_along_axis(stable_softmax, 1, logits) labels = np.random.randint(0, class_num, batch_size, dtype="int32") - cross_entropy = [ - -np.log(softmax[i][labels[i]]) for i in range(softmax.shape[0]) - ] + cross_entropy = np.asmatrix( + [[-np.log(softmax[i][labels[i]])] for i in range(softmax.shape[0])], + dtype="float32") self.inputs = {"Logits": logits, "Label": labels} - self.outputs = {"Loss": cross_entropy} + self.outputs = {"Softmax": softmax, "Loss": cross_entropy} def test_check_output(self): self.check_output() def test_check_grad(self): - pass + self.check_grad(["Logits"], "Loss") if __name__ == "__main__": From 8f8ea005fecd911e913ff728ed37ecb990dfbbca Mon Sep 17 00:00:00 2001 From: caoying03 Date: Fri, 15 Sep 2017 14:51:04 +0800 Subject: [PATCH 06/51] fix implementations. --- paddle/operators/math/utils.h | 42 ++++++++ paddle/operators/onehot_cross_entropy_op.cu | 20 +--- .../softmax_with_cross_entropy_op.cc | 12 +-- .../softmax_with_cross_entropy_op.cu | 97 ++++++++++++++++++- .../operators/softmax_with_cross_entropy_op.h | 7 +- .../framework/tests/test_cross_entropy_op.py | 1 - .../test_softmax_with_cross_entropy_op.py | 7 +- 7 files changed, 151 insertions(+), 35 deletions(-) create mode 100644 paddle/operators/math/utils.h diff --git a/paddle/operators/math/utils.h b/paddle/operators/math/utils.h new file mode 100644 index 0000000000..1e72c8e0ca --- /dev/null +++ b/paddle/operators/math/utils.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include "paddle/platform/assert.h" +#include "paddle/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +T HOSTDEVICE tolerable_value(const T x) { + PADDLE_ASSERT(std::is_floating_point::value); + + const T kApproInf = 1e20; + + if (x == INFINITY) { + return kApproInf; + } + + if (x == -INFINITY) { + return -kApproInf; + } + + return x; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/onehot_cross_entropy_op.cu b/paddle/operators/onehot_cross_entropy_op.cu index d999bfce58..f8ed9680e7 100644 --- a/paddle/operators/onehot_cross_entropy_op.cu +++ b/paddle/operators/onehot_cross_entropy_op.cu @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/utils.h" #include "paddle/platform/assert.h" namespace paddle { @@ -20,20 +21,6 @@ namespace operators { using Tensor = framework::Tensor; -template -__host__ __device__ T clipping_log(const T x) { - PADDLE_ASSERT(std::is_floating_point::value); - const T kApproInf = 1e20; - T v = log(x); - if (v == INFINITY) { - return kApproInf; - } - if (v == -INFINITY) { - return -kApproInf; - } - return v; -} - template __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, const int N, const int D) { @@ -42,7 +29,7 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { PADDLE_ASSERT(label[i] >= 0 && label[i] < D); - Y[i] = -clipping_log(X[i * D + label[i]]); + Y[i] = -math::tolerable_value(log(X[i * D + label[i]])); } } @@ -73,7 +60,7 @@ class OnehotCrossEntropyOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "This kernel only runs on GPU device."); auto X = ctx.Input("X"); const T* Xdata = X->data(); @@ -86,6 +73,7 @@ class OnehotCrossEntropyOpCUDAKernel : public framework::OpKernel { int D = X->dims()[1]; int block = 512; int grid = (N + block - 1) / block; + // TODO(qingqing) launch kernel on specified stream // base on ExecutionContext. CrossEntropyKernel<<>>(Ydata, Xdata, label_data, N, D); diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index fd75494ff8..a0941bb624 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -32,7 +32,7 @@ class SoftmaxWithCrossEntropyOpMaker "Store the outputs of softmax function, " "which will be used in backward calculation.") .AsIntermediate(); - AddOutput("Loss", "A 1-D tensor with shape N."); + AddOutput("Out", "A 1-D tensor with shape N."); AddComment(R"DOC( Cross entropy loss with softmax are used as the output layer extensively. 
This operator computes the softmax normalized values for each row of the input @@ -56,14 +56,14 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext& ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Loss")), - "Input(Loss@Grad) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null"); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Softmax"), "Input(Softmax) should be not null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), "Input(Lable) should be not null."); - ctx.Output(framework::GradVarName("Logits")) + ctx.Output(framework::GradVarName("Logits")) ->Resize(ctx.Input("Softmax")->dims()); } }; @@ -81,8 +81,8 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx.Input("Label")->dims().size() == 1UL, "The label should be a 1-d tensor."); - ctx.Output("Softmax")->Resize(logits->dims()); - ctx.Output("Loss")->Resize({logits->dims()[0], 1}); + ctx.Output("Softmax")->Resize(logits->dims()); + ctx.Output("Out")->Resize({logits->dims()[0], 1}); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu index 922bb19d4d..5af6a521a8 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/operators/softmax_with_cross_entropy_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,8 +13,97 @@ limitations under the License. */ #define EIGEN_USE_GPU -#include "softmax_with_cross_entropy_op.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/softmax_function.h" +#include "paddle/operators/math/utils.h" -namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +__global__ void CrossEntropyKernel(T* out, const T* softmax_out, + const int* label, const int batch_size, + const int class_num) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= batch_size) return; + PADDLE_ASSERT(label[i] >= 0 && label[i] < class_num); + out[i] = -math::tolerable_value(log(softmax_out[i * class_num + label[i]])); +} + +template +__global__ void CrossEntropyWithSoftmaxGradKernel(T* softmax_out, + const int* label, + const int batch_size, + const int class_num) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= batch_size) return; + + PADDLE_ASSERT(label[i] >= 0 && label[i] < class_num); + softmax_out[i * class_num + label[i]] -= 1.; +} + +template +class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + + // Calculate ths softmax outputs. + const Tensor* logits = context.Input("Logits"); + Tensor* softmax = context.Output("Softmax"); + softmax->mutable_data(context.GetPlace()); + math::SoftmaxFunctor()(logits, softmax, context); + T* softmax_out = softmax->data(); + + // Calculate the cross entropy loss based on hard labels. 
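+    // Each thread of CrossEntropyKernel handles one sample: it reads that
+    // sample's label and writes loss[i] = -log(softmax[i * class_num + label[i]]),
+    // with math::tolerable_value clipping the log so that a zero probability
+    // does not produce an infinite loss. The launch below uses 512 threads per
+    // block and enough blocks to cover the whole batch.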
+ const int* label_data = context.Input("Label")->data(); + Tensor* loss = context.Output("Out"); + loss->mutable_data(context.GetPlace()); + T* loss_data = loss->data(); + + const int batch_size = logits->dims()[0]; + const int class_num = logits->dims()[1]; + int block = 512; + int grid = (batch_size + block - 1) / block; -// TODO(caoying) add GPU kernel + CrossEntropyKernel<<>>(loss_data, softmax_out, label_data, + batch_size, class_num); + } +}; + +template +class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + logit_grad->ShareDataWith(*context.Input("Softmax")); + T* logit_grad_data = logit_grad->data(); + + const int batch_size = logit_grad->dims()[0]; + const int class_num = logit_grad->dims()[1]; + + const int* label_data = context.Input("Label")->data(); + + const int block = 512; + const int grid = (batch_size + block - 1) / block; + + CrossEntropyWithSoftmaxGradKernel<<>>( + logit_grad_data, label_data, batch_size, class_num); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyCUDAKernel); +REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h index e147cdb815..38b92a0bcd 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -30,8 +30,7 @@ template class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto place = context.GetPlace(); - PADDLE_ENFORCE(platform::is_cpu_place(place), + PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()), "This kernel only runs on CPU."); // Calculate ths softmax outputs. 
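    // (The CPU kernel mirrors the CUDA path: the row-wise softmax is written
    // into the "Softmax" output first, and the loss is then read off as the
    // negative log probability of each sample's ground-truth class.)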
@@ -45,7 +44,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { T* softmax_out = softmax->data(); const int* label_data = context.Input("Label")->data(); - Tensor* loss = context.Output("Loss"); + Tensor* loss = context.Output("Out"); loss->mutable_data(context.GetPlace()); T* loss_data = loss->data(); @@ -74,7 +73,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { const int* label_data = context.Input("Label")->data(); for (int i = 0; i < batch_size; ++i) { int index = i * class_num + label_data[i]; - logit_grad_data[index] -= .1; + logit_grad_data[index] -= 1.; } } }; diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 5e06525d61..253e7b8a24 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -1,7 +1,6 @@ import unittest import numpy from op_test import OpTest -import pdb class TestCrossEntropy(OpTest): diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py index 4e35c063b9..e965dd0482 100644 --- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py @@ -1,6 +1,5 @@ import unittest import numpy as np -import pdb from op_test import OpTest from test_softmax_op import stable_softmax @@ -11,7 +10,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): self.op_type = "softmax_with_cross_entropy" MAX_BATCH_SIZE = 23 - MAX_CLASS_NUM = 10 + MAX_CLASS_NUM = 17 batch_size = np.random.randint(1, MAX_BATCH_SIZE, 1)[0] class_num = np.random.randint(2, MAX_CLASS_NUM, 1)[0] @@ -26,13 +25,13 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): dtype="float32") self.inputs = {"Logits": logits, "Label": labels} - self.outputs = {"Softmax": softmax, "Loss": cross_entropy} + self.outputs = {"Softmax": softmax, "Out": cross_entropy} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(["Logits"], "Loss") + self.check_grad(["Logits"], "Out", max_relative_error=0.05) if __name__ == "__main__": From f1d5fb3b9a6201f3eaf92b12d84b3e3727a3a575 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Thu, 21 Sep 2017 17:47:52 +0800 Subject: [PATCH 07/51] support soft labels. 
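This change lets softmax_with_cross_entropy accept either hard labels (one
class index per sample) or soft labels (a full probability distribution per
sample), selected through the new soft_label attribute documented below. As a
rough NumPy reference for the two loss definitions added to the operator
comment (the helper names here are illustrative and not part of the patch;
stable_softmax mirrors the helper the Python tests already import):

    import numpy as np

    def stable_softmax(logits):
        # shift by the row-wise max before exponentiating, for numerical stability
        shifted = logits - logits.max(axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / exps.sum(axis=1, keepdims=True)

    def hard_label_loss(logits, labels):
        # labels: int vector of shape [N], one class index per sample
        prob = stable_softmax(logits)
        return -np.log(prob[np.arange(logits.shape[0]), labels])

    def soft_label_loss(logits, label_dist):
        # label_dist: float matrix of shape [N, K], each row summing to 1
        prob = stable_softmax(logits)
        return -(label_dist * np.log(prob)).sum(axis=1)

For hard labels the gradient of the loss with respect to the logits is
softmax(logits) minus the one-hot label, which is why the backward kernels
only subtract 1 at each sample's label position.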
--- paddle/operators/math/CMakeLists.txt | 4 +- .../math/{softmax_function.cc => softmax.cc} | 2 +- .../math/{softmax_function.cu => softmax.cu} | 2 +- .../math/{softmax_function.h => softmax.h} | 0 paddle/operators/math/utils.h | 42 ----------- paddle/operators/softmax_op.h | 2 +- .../softmax_with_cross_entropy_op.cc | 75 +++++++++++++------ .../softmax_with_cross_entropy_op.cu | 22 +++--- .../operators/softmax_with_cross_entropy_op.h | 8 +- .../test_softmax_with_cross_entropy_op.py | 4 +- 10 files changed, 74 insertions(+), 87 deletions(-) rename paddle/operators/math/{softmax_function.cc => softmax.cc} (93%) rename paddle/operators/math/{softmax_function.cu => softmax.cu} (94%) rename paddle/operators/math/{softmax_function.h => softmax.h} (100%) delete mode 100644 paddle/operators/math/utils.h diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 832a954e3a..074ca47d7f 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -1,12 +1,12 @@ if(WITH_GPU) nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator) - nv_library(softmax_function SRCS softmax_function.cc softmax_function.cu + nv_library(softmax_function SRCS softmax.cc softmax.cu DEPS operator) else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) - cc_library(softmax_function SRCS softmax_function.cc DEPS operator) + cc_library(softmax_function SRCS softmax.cc DEPS operator) endif() nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/softmax_function.cc b/paddle/operators/math/softmax.cc similarity index 93% rename from paddle/operators/math/softmax_function.cc rename to paddle/operators/math/softmax.cc index cd46ed96ca..ac9f3c4bf6 100644 --- a/paddle/operators/math/softmax_function.cc +++ b/paddle/operators/math/softmax.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/math/softmax_function.h" +#include "paddle/operators/math/softmax.h" namespace paddle { namespace operators { diff --git a/paddle/operators/math/softmax_function.cu b/paddle/operators/math/softmax.cu similarity index 94% rename from paddle/operators/math/softmax_function.cu rename to paddle/operators/math/softmax.cu index 486697a161..4c3df0550e 100644 --- a/paddle/operators/math/softmax_function.cu +++ b/paddle/operators/math/softmax.cu @@ -14,7 +14,7 @@ #define EIGEN_USE_GPU -#include "paddle/operators/math/softmax_function.h" +#include "paddle/operators/math/softmax.h" namespace paddle { namespace operators { diff --git a/paddle/operators/math/softmax_function.h b/paddle/operators/math/softmax.h similarity index 100% rename from paddle/operators/math/softmax_function.h rename to paddle/operators/math/softmax.h diff --git a/paddle/operators/math/utils.h b/paddle/operators/math/utils.h deleted file mode 100644 index 1e72c8e0ca..0000000000 --- a/paddle/operators/math/utils.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include "paddle/platform/assert.h" -#include "paddle/platform/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -template -T HOSTDEVICE tolerable_value(const T x) { - PADDLE_ASSERT(std::is_floating_point::value); - - const T kApproInf = 1e20; - - if (x == INFINITY) { - return kApproInf; - } - - if (x == -INFINITY) { - return -kApproInf; - } - - return x; -} - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 68d05fc215..18494e470a 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/math/softmax_function.h" +#include "paddle/operators/math/softmax.h" namespace paddle { namespace operators { diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index a0941bb624..3dd21279ad 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -23,16 +23,32 @@ class SoftmaxWithCrossEntropyOpMaker SoftmaxWithCrossEntropyOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { + //(TODO caoying) replace int with boolean + AddAttr("soft_label", + "(int, default 0), A flag to indicate whether to interpretate " + "the given labels as soft labels.") + .SetDefault(0); AddInput("Logits", - "The unscaled log probabilities which is a 2-D tensor with" - "shape [N x K]. N is the batch_size, and K is the class number.") + "(Tensor, default Tensor), The unscaled log probabilities " + "which is a 2-D tensor with shape [N x K]. N is the batch_size, " + "and K is the class number.") .NotInGradient(); - AddInput("Label", "The ground truth. A 1-D tensor with shape N."); - AddOutput("Softmax", - "Store the outputs of softmax function, " - "which will be used in backward calculation.") + AddInput( + "Label", + "(Tensor, default Tensor), The ground truth which is " + "a 1-D or 2-D tensor. " + "If soft_label is set to 0, Label is a Tensor with shape [N x 1]. " + "If soft_label is set to 1, Label is a Tensor " + "with shape [N x K]."); + AddOutput( + "Softmax", + "(Tensor, default Tensor), A 2-D tensor with shape [N x K]. " + "The outputs value of softmax activation by given the input batch, " + "which will be used in backward calculation.") .AsIntermediate(); - AddOutput("Out", "A 1-D tensor with shape N."); + AddOutput("Loss", + "(Tensor, default Tensor), A 1-D tensor. The cross " + "entropy loss with shape [N x 1]."); AddComment(R"DOC( Cross entropy loss with softmax are used as the output layer extensively. This operator computes the softmax normalized values for each row of the input @@ -46,25 +62,18 @@ which will produce incorrect results. This operators expects mutually exclusive hard labels, each sample in a batch is in exactly one class with probabilities 1. 
Each sample in the batch with one and only one label. -)DOC"); - } -}; -class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; +Equation: - protected: - void InferShape(const framework::InferShapeContext& ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), - "Input(Out@Grad) should not be null"); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Softmax"), - "Input(Softmax) should be not null."); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), - "Input(Lable) should be not null."); +1) hard label (one-hot label) - ctx.Output(framework::GradVarName("Logits")) - ->Resize(ctx.Input("Softmax")->dims()); +Loss_j = -\text{Logit}_{Label_j} + \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right), j = 1, ..., K + +2) soft label (a distribution over all classes) + +Loss_j = -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i-\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right), j = 1,...,K + +)DOC"); } }; @@ -82,7 +91,25 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { "The label should be a 1-d tensor."); ctx.Output("Softmax")->Resize(logits->dims()); - ctx.Output("Out")->Resize({logits->dims()[0], 1}); + ctx.Output("Loss")->Resize({logits->dims()[0], 1}); + } +}; + +class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext& ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Loss")), + "Input(Loss@Grad) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Softmax"), + "Input(Softmax) should be not null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), + "Input(Lable) should be not null."); + + ctx.Output(framework::GradVarName("Logits")) + ->Resize(ctx.Input("Softmax")->dims()); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu index 5af6a521a8..68bb85fa8a 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/operators/softmax_with_cross_entropy_op.cu @@ -13,9 +13,10 @@ limitations under the License. 
*/ #define EIGEN_USE_GPU + #include "paddle/framework/op_registry.h" -#include "paddle/operators/math/softmax_function.h" -#include "paddle/operators/math/utils.h" +#include "paddle/operators/cross_entropy_op.h" +#include "paddle/operators/math/softmax.h" namespace paddle { namespace operators { @@ -27,9 +28,10 @@ __global__ void CrossEntropyKernel(T* out, const T* softmax_out, const int* label, const int batch_size, const int class_num) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= batch_size) return; - PADDLE_ASSERT(label[i] >= 0 && label[i] < class_num); - out[i] = -math::tolerable_value(log(softmax_out[i * class_num + label[i]])); + if (i < batch_size) { + PADDLE_ASSERT(label[i] >= 0 && label[i] < class_num); + out[i] = -tolerable_value(std::log(softmax_out[i * class_num + label[i]])); + } } template @@ -38,10 +40,10 @@ __global__ void CrossEntropyWithSoftmaxGradKernel(T* softmax_out, const int batch_size, const int class_num) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= batch_size) return; - - PADDLE_ASSERT(label[i] >= 0 && label[i] < class_num); - softmax_out[i * class_num + label[i]] -= 1.; + if (i < batch_size) { + PADDLE_ASSERT(label[i] >= 0 && label[i] < class_num); + softmax_out[i * class_num + label[i]] -= 1.; + } } template @@ -60,7 +62,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { // Calculate the cross entropy loss based on hard labels. const int* label_data = context.Input("Label")->data(); - Tensor* loss = context.Output("Out"); + Tensor* loss = context.Output("Loss"); loss->mutable_data(context.GetPlace()); T* loss_data = loss->data(); diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h index 38b92a0bcd..0ad48dae2c 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -15,8 +15,8 @@ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/math/softmax_function.h" -#include "paddle/operators/math/utils.h" +#include "paddle/operators/cross_entropy_op.h" +#include "paddle/operators/math/softmax.h" namespace paddle { namespace operators { @@ -44,7 +44,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { T* softmax_out = softmax->data(); const int* label_data = context.Input("Label")->data(); - Tensor* loss = context.Output("Out"); + Tensor* loss = context.Output("Loss"); loss->mutable_data(context.GetPlace()); T* loss_data = loss->data(); @@ -53,7 +53,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { for (int i = 0; i < batch_size; ++i) { int index = i * class_num + label_data[i]; - loss_data[i] = -math::tolerable_value(std::log(softmax_out[index])); + loss_data[i] = -tolerable_value(std::log(softmax_out[index])); } } }; diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py index e965dd0482..9c9ee77b73 100644 --- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py @@ -25,13 +25,13 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): dtype="float32") self.inputs = {"Logits": logits, "Label": labels} - self.outputs = {"Softmax": softmax, "Out": cross_entropy} + self.outputs = {"Softmax": softmax, "Loss": cross_entropy} def test_check_output(self): self.check_output() def test_check_grad(self): - 
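        # check_grad numerically perturbs the Logits input and compares the
        # finite-difference gradient of Loss against the gradient computed by
        # the operator's backward kernel, allowing up to 5% relative error.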
self.check_grad(["Logits"], "Out", max_relative_error=0.05) + self.check_grad(["Logits"], "Loss", max_relative_error=0.05) if __name__ == "__main__": From 859dba591baaac50656fe3e25ab5a50a17445fc5 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 21 Sep 2017 20:13:07 -0700 Subject: [PATCH 08/51] Init commit --- paddle/pybind/pybind.cc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index c7009a604f..90b995decb 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -315,6 +315,23 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compile_gpu", IsCompileGPU); + py::class_(m, "ProgramDesc", "") + .def_static("instance", [] { return &GetProgramDesc(); }) + .def("append_block", [](ProgramDesc &self) { + auto desc = self.mutable_blocks()->Add(); + desc->set_idx(self.mutable_blocks()->size() - 1); + return desc; + }); + py::class_(m, "BlockDesc", "") + .def("idx", [](BlockDesc &self) { return self.idx(); }) + .def("set_parent", + [](BlockDesc &self, int32_t idx) { self.set_parent_idx(idx); }) + .def("parent", [](BlockDesc &self) { return self.parent_idx(); }); + + py::class_(m, "VarDesc", ""); + + py::class_(m, "OpDesc", ""); + return m.ptr(); } } // namespace framework From 7788b4605500d22ac31359115b3d341faa610080 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 21 Sep 2017 21:13:14 -0700 Subject: [PATCH 09/51] Expose VarDesc interface --- paddle/pybind/pybind.cc | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 90b995decb..e1f7bc8672 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -328,7 +328,31 @@ All parameter, weight, gradient are variables in Paddle. 
[](BlockDesc &self, int32_t idx) { self.set_parent_idx(idx); }) .def("parent", [](BlockDesc &self) { return self.parent_idx(); }); - py::class_(m, "VarDesc", ""); + py::class_(m, "VarDesc", "") + .def(py::init<>()) + .def("set_name", + [](VarDesc &self, const std::string &name) { self.set_name(name); }) + .def("set_shape", + [](VarDesc &self, const std::vector &dims) { + LoDTensorDesc *lod_tensor_desc = self.mutable_lod_tensor(); + for (const int64_t &i : dims) { + lod_tensor_desc->add_dims(i); + } + }) + .def("set_data_type", + [](VarDesc &self, int type_id) { + LoDTensorDesc *lod_tensor_desc = self.mutable_lod_tensor(); + lod_tensor_desc->set_data_type(static_cast(type_id)); + }) + .def("shape", [](VarDesc &self) { + const LoDTensorDesc &lod_tensor_desc = self.lod_tensor(); + int rank = lod_tensor_desc.dims_size(); + std::vector res(rank); + for (int i = 0; i < rank; ++i) { + res[i] = lod_tensor_desc.dims(i); + } + return res; + }); py::class_(m, "OpDesc", ""); From 70f398e2074b84701dc9b1b16f518ed9b9b16b62 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 21 Sep 2017 21:17:47 -0700 Subject: [PATCH 10/51] Update --- paddle/framework/attribute.cc | 3 + paddle/pybind/pybind.cc | 74 ++++++++++++++++--- .../v2/framework/tests/test_protobuf_descs.py | 16 ++++ 3 files changed, 83 insertions(+), 10 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_protobuf_descs.py diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc index 159ed03b92..0a305e8a8c 100644 --- a/paddle/framework/attribute.cc +++ b/paddle/framework/attribute.cc @@ -24,6 +24,9 @@ static ProgramDesc* g_program_desc = nullptr; ProgramDesc& GetProgramDesc() { if (g_program_desc == nullptr) { g_program_desc = new ProgramDesc(); + auto root_block = g_program_desc->mutable_blocks()->Add(); + root_block->set_idx(0); + root_block->set_parent_idx(-1); } return *g_program_desc; } diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 90b995decb..835ea85aa1 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -316,21 +316,75 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("is_compile_gpu", IsCompileGPU); py::class_(m, "ProgramDesc", "") - .def_static("instance", [] { return &GetProgramDesc(); }) - .def("append_block", [](ProgramDesc &self) { - auto desc = self.mutable_blocks()->Add(); - desc->set_idx(self.mutable_blocks()->size() - 1); - return desc; - }); + .def_static("instance", + [] { return &GetProgramDesc(); }, + py::return_value_policy::reference) + .def("append_block", + [](ProgramDesc &self, BlockDesc &parent) { + auto desc = self.mutable_blocks()->Add(); + desc->set_idx(self.mutable_blocks()->size() - 1); + desc->set_parent_idx(parent.idx()); + return desc; + }) + .def("root_block", + [](ProgramDesc &self) { return self.mutable_blocks()[0]; }); py::class_(m, "BlockDesc", "") .def("idx", [](BlockDesc &self) { return self.idx(); }) - .def("set_parent", - [](BlockDesc &self, int32_t idx) { self.set_parent_idx(idx); }) - .def("parent", [](BlockDesc &self) { return self.parent_idx(); }); + .def("parent", [](BlockDesc &self) { return self.parent_idx(); }) + .def("append_op", + [](BlockDesc &self) { return self.mutable_ops()->Add(); }); py::class_(m, "VarDesc", ""); - py::class_(m, "OpDesc", ""); + auto op_desc_set_var = [](OpDesc::Var *var, + const std::string ¶meter, + const std::vector &arguments) { + var->set_parameter(parameter); + auto args = var->mutable_arguments(); + args->Reserve(static_cast(arguments.size())); + for (auto &arg : arguments) { + *args->Add() = arg; + } + }; + + auto op_desc_set_attr = [](OpDesc &desc, const std::string &name) { + auto attr = desc.mutable_attrs()->Add(); + attr->set_name(name); + return attr; + }; + + py::class_(m, "OpDesc", "") + .def("type", [](OpDesc &op) { return op.type(); }) + .def("set_input", + [op_desc_set_var](OpDesc &self, + const std::string ¶meter, + const std::vector &arguments) { + auto ipt = self.mutable_inputs()->Add(); + op_desc_set_var(ipt, parameter, arguments); + }) + .def("input_names", + [](OpDesc &self) { + std::vector ret_val; + ret_val.reserve(static_cast(self.inputs().size())); + std::transform( + self.inputs().begin(), + self.inputs().end(), + std::back_inserter(ret_val), + [](const OpDesc::Var &var) { return var.parameter(); }); + return ret_val; + }) + .def("__str__", [](OpDesc &self) { return self.DebugString(); }) + .def("set_output", + [op_desc_set_var](OpDesc &self, + const std::string ¶meter, + const std::vector &arguments) { + auto opt = self.mutable_outputs()->Add(); + op_desc_set_var(opt, parameter, arguments); + }) + .def("set_attr", + [op_desc_set_attr](OpDesc &self, const std::string &name, int i) { + op_desc_set_attr(self, name)->set_i(i); + }); return m.ptr(); } diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/framework/tests/test_protobuf_descs.py new file mode 100644 index 0000000000..945610ff45 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py @@ -0,0 +1,16 @@ +import unittest +import paddle.v2.framework.core as core + + +class TestProgramDesc(unittest.TestCase): + def test_instance(self): + program_desc = core.ProgramDesc.instance() + self.assertIsNotNone(program_desc) + del program_desc + program_desc = core.ProgramDesc.instance() + self.assertIsNotNone(program_desc) + del program_desc + + +if __name__ == '__main__': + unittest.main() From 332369ca5c72fbe88f2504f71285b25247cc966e Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 21 Sep 2017 21:32:42 -0700 Subject: [PATCH 11/51] Add `new_var` for BlockDesc --- paddle/pybind/pybind.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 
deletions(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index cbb7b1cbff..fa10c8e472 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -321,18 +321,19 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference) .def("append_block", [](ProgramDesc &self, BlockDesc &parent) { - auto desc = self.mutable_blocks()->Add(); + auto desc = self.add_blocks(); desc->set_idx(self.mutable_blocks()->size() - 1); desc->set_parent_idx(parent.idx()); return desc; }) .def("root_block", [](ProgramDesc &self) { return self.mutable_blocks()[0]; }); + py::class_(m, "BlockDesc", "") .def("idx", [](BlockDesc &self) { return self.idx(); }) .def("parent", [](BlockDesc &self) { return self.parent_idx(); }) - .def("append_op", - [](BlockDesc &self) { return self.mutable_ops()->Add(); }); + .def("append_op", [](BlockDesc &self) { return self.add_ops(); }) + .def("new_var", [](BlockDesc &self) { return self.add_vars(); }); py::class_(m, "VarDesc", "") .def(py::init<>()) @@ -372,7 +373,7 @@ All parameter, weight, gradient are variables in Paddle. }; auto op_desc_set_attr = [](OpDesc &desc, const std::string &name) { - auto attr = desc.mutable_attrs()->Add(); + auto attr = desc.add_attrs(); attr->set_name(name); return attr; }; @@ -383,7 +384,7 @@ All parameter, weight, gradient are variables in Paddle. [op_desc_set_var](OpDesc &self, const std::string ¶meter, const std::vector &arguments) { - auto ipt = self.mutable_inputs()->Add(); + auto ipt = self.add_inputs(); op_desc_set_var(ipt, parameter, arguments); }) .def("input_names", @@ -402,7 +403,7 @@ All parameter, weight, gradient are variables in Paddle. [op_desc_set_var](OpDesc &self, const std::string ¶meter, const std::vector &arguments) { - auto opt = self.mutable_outputs()->Add(); + auto opt = self.add_outputs(); op_desc_set_var(opt, parameter, arguments); }) .def("set_attr", From 618884dd69af0f2e7ea7c0527ec2ba8131ec5a07 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 21 Sep 2017 21:39:42 -0700 Subject: [PATCH 12/51] Complete unittest for ProgramDesc --- paddle/pybind/pybind.cc | 21 +++++++++++++++---- .../v2/framework/tests/test_protobuf_descs.py | 9 ++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index cbb7b1cbff..cae3671350 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -319,17 +319,30 @@ All parameter, weight, gradient are variables in Paddle. 
.def_static("instance", [] { return &GetProgramDesc(); }, py::return_value_policy::reference) + .def_static("__create_program_desc__", + [] { + // Only used for unit-test + auto *prog_desc = new ProgramDesc; + auto *block = prog_desc->mutable_blocks()->Add(); + block->set_idx(0); + block->set_parent_idx(-1); + return prog_desc; + }) .def("append_block", [](ProgramDesc &self, BlockDesc &parent) { - auto desc = self.mutable_blocks()->Add(); + auto desc = self.add_blocks(); desc->set_idx(self.mutable_blocks()->size() - 1); desc->set_parent_idx(parent.idx()); return desc; - }) + }, + py::return_value_policy::reference) .def("root_block", - [](ProgramDesc &self) { return self.mutable_blocks()[0]; }); + [](ProgramDesc &self) { return self.mutable_blocks()->Mutable(0); }, + py::return_value_policy::reference) + .def("__str__", [](ProgramDesc &self) { return self.DebugString(); }); + py::class_(m, "BlockDesc", "") - .def("idx", [](BlockDesc &self) { return self.idx(); }) + .def("id", [](BlockDesc &self) { return self.idx(); }) .def("parent", [](BlockDesc &self) { return self.parent_idx(); }) .def("append_op", [](BlockDesc &self) { return self.mutable_ops()->Add(); }); diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/framework/tests/test_protobuf_descs.py index 945610ff45..8e94843662 100644 --- a/python/paddle/v2/framework/tests/test_protobuf_descs.py +++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py @@ -9,8 +9,17 @@ class TestProgramDesc(unittest.TestCase): del program_desc program_desc = core.ProgramDesc.instance() self.assertIsNotNone(program_desc) + self.assertIsNotNone(program_desc.root_block()) del program_desc + def test_append_block(self): + prog_desc = core.ProgramDesc.__create_program_desc__() + self.assertIsNotNone(prog_desc) + block1 = prog_desc.append_block(prog_desc.root_block()) + block2 = prog_desc.append_block(block1) + self.assertEqual(block1.id(), block2.parent()) + self.assertEqual(prog_desc.root_block().id(), block1.parent()) + if __name__ == '__main__': unittest.main() From b154c0e51ea1853ab935bf6c86966b0092babe6a Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 22 Sep 2017 10:07:17 -0700 Subject: [PATCH 13/51] Update --- paddle/pybind/pybind.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index a6b583821f..543dbb739e 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -344,8 +344,12 @@ All parameter, weight, gradient are variables in Paddle. 
py::class_(m, "BlockDesc", "") .def("id", [](BlockDesc &self) { return self.idx(); }) .def("parent", [](BlockDesc &self) { return self.parent_idx(); }) - .def("append_op", [](BlockDesc &self) { return self.add_ops(); }) - .def("new_var", [](BlockDesc &self) { return self.add_vars(); }); + .def("append_op", + [](BlockDesc &self) { return self.add_ops(); }, + py::return_value_policy::reference) + .def("new_var", + [](BlockDesc &self) { return self.add_vars(); }, + py::return_value_policy::reference); py::class_(m, "VarDesc", "") .def(py::init<>()) From ee547f6ac984b8880394acceb6fbec856f6a2dde Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 22 Sep 2017 10:37:34 -0700 Subject: [PATCH 14/51] Add unittests --- .../v2/framework/tests/test_protobuf_descs.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/framework/tests/test_protobuf_descs.py index 8e94843662..71bdca8765 100644 --- a/python/paddle/v2/framework/tests/test_protobuf_descs.py +++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py @@ -15,10 +15,25 @@ class TestProgramDesc(unittest.TestCase): def test_append_block(self): prog_desc = core.ProgramDesc.__create_program_desc__() self.assertIsNotNone(prog_desc) - block1 = prog_desc.append_block(prog_desc.root_block()) + block_root = prog_desc.root_block() + self.assertEqual(block_root.id(), 0) + block1 = prog_desc.append_block(block_root) block2 = prog_desc.append_block(block1) self.assertEqual(block1.id(), block2.parent()) - self.assertEqual(prog_desc.root_block().id(), block1.parent()) + self.assertEqual(block_root.id(), block1.parent()) + block3 = prog_desc.append_block(block_root) + self.assertEqual(block3.parent(), block_root.id()) + + +class TestVarDesc(unittest.TestCase): + def test_shape(self): + program_desc = core.ProgramDesc.instance() + block = program_desc.root_block() + var = block.new_var() + src_shape = [3, 2, 10, 8] + var.set_shape(src_shape) + res_shape = var.shape() + self.assertEqual(src_shape, res_shape) if __name__ == '__main__': From 17d93f4a04fa03517d84d20fd31829a7e02847b4 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 22 Sep 2017 10:38:04 -0700 Subject: [PATCH 15/51] Add Helper for Vector2Repeated --- paddle/pybind/pybind.cc | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index a6b583821f..74f5904034 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -53,6 +53,25 @@ bool IsCompileGPU() { #endif } +template +inline std::vector RepeatedToVector( + const google::protobuf::RepeatedField &repeated_field) { + std::vector ret; + ret.reserve(repeated_field.size()); + std::copy( + repeated_field.begin(), repeated_field.end(), std::back_inserter(ret)); + return ret; +} + +template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Reserve(vec.size()); + for (auto &elem : vec) { + *repeated_field->Add() = elem; + } +} + PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of PaddlePaddle"); @@ -377,11 +396,7 @@ All parameter, weight, gradient are variables in Paddle. 
const std::string ¶meter, const std::vector &arguments) { var->set_parameter(parameter); - auto args = var->mutable_arguments(); - args->Reserve(static_cast(arguments.size())); - for (auto &arg : arguments) { - *args->Add() = arg; - } + VectorToRepeated(arguments, var->mutable_arguments()); }; auto op_desc_set_attr = [](OpDesc &desc, const std::string &name) { From e29003669ff30272070bb8513fb95c2042c305b9 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 22 Sep 2017 11:23:47 -0700 Subject: [PATCH 16/51] Moving protobuf binding code to protobuf module --- paddle/pybind/CMakeLists.txt | 3 +- paddle/pybind/protobuf.cc | 136 ++++++++++++++++++++++++++++++++++ paddle/pybind/protobuf.h | 54 ++++++++++++++ paddle/pybind/pybind.cc | 140 ++--------------------------------- 4 files changed, 197 insertions(+), 136 deletions(-) create mode 100644 paddle/pybind/protobuf.cc create mode 100644 paddle/pybind/protobuf.h diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index 4f05406c7f..a1d7483973 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,6 +1,7 @@ if(WITH_PYTHON) + cc_library(proto_bind SRCS protobuf.cc) cc_library(paddle_pybind SHARED SRCS pybind.cc - DEPS pybind python backward + DEPS proto_bind pybind python backward ${GLOB_OP_LIB}) endif(WITH_PYTHON) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc new file mode 100644 index 0000000000..91f4c7d7c8 --- /dev/null +++ b/paddle/pybind/protobuf.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pybind/protobuf.h" + +namespace paddle { +namespace framework { + +void bind_program_desc(py::module &m) { + py::class_(m, "ProgramDesc", "") + .def_static("instance", + [] { return &GetProgramDesc(); }, + py::return_value_policy::reference) + .def_static("__create_program_desc__", + [] { + // Only used for unit-test + auto *prog_desc = new ProgramDesc; + auto *block = prog_desc->mutable_blocks()->Add(); + block->set_idx(0); + block->set_parent_idx(-1); + return prog_desc; + }) + .def("append_block", + [](ProgramDesc &self, BlockDesc &parent) { + auto desc = self.add_blocks(); + desc->set_idx(self.mutable_blocks()->size() - 1); + desc->set_parent_idx(parent.idx()); + return desc; + }, + py::return_value_policy::reference) + .def("root_block", + [](ProgramDesc &self) { return self.mutable_blocks()->Mutable(0); }, + py::return_value_policy::reference) + .def("__str__", [](ProgramDesc &self) { return self.DebugString(); }); +} + +void bind_block_desc(py::module &m) { + py::class_(m, "BlockDesc", "") + .def("id", [](BlockDesc &self) { return self.idx(); }) + .def("parent", [](BlockDesc &self) { return self.parent_idx(); }) + .def("append_op", + [](BlockDesc &self) { return self.add_ops(); }, + py::return_value_policy::reference) + .def("new_var", + [](BlockDesc &self) { return self.add_vars(); }, + py::return_value_policy::reference); +} + +void bind_var_dses(py::module &m) { + py::class_(m, "VarDesc", "") + .def(py::init<>()) + .def("set_name", + [](VarDesc &self, const std::string &name) { self.set_name(name); }) + .def("set_shape", + [](VarDesc &self, const std::vector &dims) { + LoDTensorDesc *lod_tensor_desc = self.mutable_lod_tensor(); + for (const int64_t &i : dims) { + lod_tensor_desc->add_dims(i); + } + }) + .def("set_data_type", + [](VarDesc &self, int type_id) { + LoDTensorDesc *lod_tensor_desc = self.mutable_lod_tensor(); + lod_tensor_desc->set_data_type(static_cast(type_id)); + }) + .def("shape", [](VarDesc &self) { + const LoDTensorDesc &lod_tensor_desc = self.lod_tensor(); + int rank = lod_tensor_desc.dims_size(); + std::vector res(rank); + for (int i = 0; i < rank; ++i) { + res[i] = lod_tensor_desc.dims(i); + } + return res; + }); +} + +void bind_op_desc(py::module &m) { + auto op_desc_set_var = [](OpDesc::Var *var, + const std::string ¶meter, + const std::vector &arguments) { + var->set_parameter(parameter); + VectorToRepeated(arguments, var->mutable_arguments()); + }; + + auto op_desc_set_attr = [](OpDesc &desc, const std::string &name) { + auto attr = desc.add_attrs(); + attr->set_name(name); + return attr; + }; + + py::class_(m, "OpDesc", "") + .def("type", [](OpDesc &op) { return op.type(); }) + .def("set_input", + [op_desc_set_var](OpDesc &self, + const std::string ¶meter, + const std::vector &arguments) { + auto ipt = self.add_inputs(); + op_desc_set_var(ipt, parameter, arguments); + }) + .def("input_names", + [](OpDesc &self) { + std::vector ret_val; + ret_val.reserve(static_cast(self.inputs().size())); + std::transform( + self.inputs().begin(), + self.inputs().end(), + std::back_inserter(ret_val), + [](const OpDesc::Var &var) { return var.parameter(); }); + return ret_val; + }) + .def("__str__", [](OpDesc &self) { return self.DebugString(); }) + .def("set_output", + [op_desc_set_var](OpDesc &self, + const std::string ¶meter, + const std::vector &arguments) { + auto opt = self.add_outputs(); + op_desc_set_var(opt, parameter, arguments); + }) + .def("set_attr", + [op_desc_set_attr](OpDesc &self, const std::string &name, int i) { + 
op_desc_set_attr(self, name)->set_i(i); + }); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/pybind/protobuf.h b/paddle/pybind/protobuf.h new file mode 100644 index 0000000000..ff4813cce7 --- /dev/null +++ b/paddle/pybind/protobuf.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/framework/op_registry.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace framework { + +template +inline std::vector RepeatedToVector( + const google::protobuf::RepeatedField& repeated_field) { + std::vector ret; + ret.reserve(repeated_field.size()); + std::copy( + repeated_field.begin(), repeated_field.end(), std::back_inserter(ret)); + return ret; +} + +template +inline void VectorToRepeated(const std::vector& vec, + RepeatedField* repeated_field) { + repeated_field->Reserve(vec.size()); + for (auto& elem : vec) { + *repeated_field->Add() = elem; + } +} + +void bind_program_desc(py::module& m); +void bind_block_desc(py::module& m); +void bind_var_dses(py::module& m); +void bind_op_desc(py::module& m); +} // namespace framework +} // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 5ccc8c377f..10c6670e00 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -12,13 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include +#include "paddle/pybind/protobuf.h" #include "paddle/framework/backward.h" #include "paddle/framework/lod_tensor.h" -#include "paddle/framework/op_registry.h" #include "paddle/operators/cond_op.h" #include "paddle/operators/net_op.h" #include "paddle/operators/recurrent_op.h" @@ -27,11 +24,6 @@ limitations under the License. */ #include "paddle/pybind/pybind.h" #include "paddle/pybind/tensor_py.h" #include "paddle/string/to_string.h" -#include "pybind11/numpy.h" -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" - -namespace py = pybind11; namespace paddle { namespace framework { @@ -53,25 +45,6 @@ bool IsCompileGPU() { #endif } -template -inline std::vector RepeatedToVector( - const google::protobuf::RepeatedField &repeated_field) { - std::vector ret; - ret.reserve(repeated_field.size()); - std::copy( - repeated_field.begin(), repeated_field.end(), std::back_inserter(ret)); - return ret; -} - -template -inline void VectorToRepeated(const std::vector &vec, - RepeatedField *repeated_field) { - repeated_field->Reserve(vec.size()); - for (auto &elem : vec) { - *repeated_field->Add() = elem; - } -} - PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of PaddlePaddle"); @@ -334,113 +307,10 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("is_compile_gpu", IsCompileGPU); - py::class_(m, "ProgramDesc", "") - .def_static("instance", - [] { return &GetProgramDesc(); }, - py::return_value_policy::reference) - .def_static("__create_program_desc__", - [] { - // Only used for unit-test - auto *prog_desc = new ProgramDesc; - auto *block = prog_desc->mutable_blocks()->Add(); - block->set_idx(0); - block->set_parent_idx(-1); - return prog_desc; - }) - .def("append_block", - [](ProgramDesc &self, BlockDesc &parent) { - auto desc = self.add_blocks(); - desc->set_idx(self.mutable_blocks()->size() - 1); - desc->set_parent_idx(parent.idx()); - return desc; - }, - py::return_value_policy::reference) - .def("root_block", - [](ProgramDesc &self) { return self.mutable_blocks()->Mutable(0); }, - py::return_value_policy::reference) - .def("__str__", [](ProgramDesc &self) { return self.DebugString(); }); - - py::class_(m, "BlockDesc", "") - .def("id", [](BlockDesc &self) { return self.idx(); }) - .def("parent", [](BlockDesc &self) { return self.parent_idx(); }) - .def("append_op", - [](BlockDesc &self) { return self.add_ops(); }, - py::return_value_policy::reference) - .def("new_var", - [](BlockDesc &self) { return self.add_vars(); }, - py::return_value_policy::reference); - - py::class_(m, "VarDesc", "") - .def(py::init<>()) - .def("set_name", - [](VarDesc &self, const std::string &name) { self.set_name(name); }) - .def("set_shape", - [](VarDesc &self, const std::vector &dims) { - LoDTensorDesc *lod_tensor_desc = self.mutable_lod_tensor(); - for (const int64_t &i : dims) { - lod_tensor_desc->add_dims(i); - } - }) - .def("set_data_type", - [](VarDesc &self, int type_id) { - LoDTensorDesc *lod_tensor_desc = self.mutable_lod_tensor(); - lod_tensor_desc->set_data_type(static_cast(type_id)); - }) - .def("shape", [](VarDesc &self) { - const LoDTensorDesc &lod_tensor_desc = self.lod_tensor(); - int rank = lod_tensor_desc.dims_size(); - std::vector res(rank); - for (int i = 0; i < rank; ++i) { - res[i] = lod_tensor_desc.dims(i); - } - return res; - }); - - auto op_desc_set_var = [](OpDesc::Var *var, - const std::string ¶meter, - const std::vector &arguments) { - var->set_parameter(parameter); - VectorToRepeated(arguments, var->mutable_arguments()); - }; - - auto op_desc_set_attr = [](OpDesc &desc, const std::string &name) { - auto attr = desc.add_attrs(); - attr->set_name(name); - return attr; - }; - - py::class_(m, "OpDesc", "") - .def("type", [](OpDesc &op) { return op.type(); }) - .def("set_input", - [op_desc_set_var](OpDesc &self, - const std::string ¶meter, - const std::vector &arguments) { - auto ipt = self.add_inputs(); - op_desc_set_var(ipt, parameter, arguments); - }) - .def("input_names", - [](OpDesc &self) { - std::vector ret_val; - ret_val.reserve(static_cast(self.inputs().size())); - std::transform( - self.inputs().begin(), - self.inputs().end(), - std::back_inserter(ret_val), - [](const OpDesc::Var &var) { return var.parameter(); }); - return ret_val; - }) - .def("__str__", [](OpDesc &self) { return self.DebugString(); }) - .def("set_output", - [op_desc_set_var](OpDesc &self, - const std::string ¶meter, - const std::vector &arguments) { - auto opt = self.add_outputs(); - op_desc_set_var(opt, parameter, arguments); - }) - .def("set_attr", - [op_desc_set_attr](OpDesc &self, const std::string &name, int i) { - op_desc_set_attr(self, name)->set_i(i); - }); + bind_program_desc(m); + bind_block_desc(m); + bind_var_dses(m); + bind_op_desc(m); return m.ptr(); } From 37fd8fa1b6ec75ac447a93bea990338550402baf Mon Sep 17 00:00:00 2001 From: 
fengjiayi Date: Fri, 22 Sep 2017 11:39:36 -0700 Subject: [PATCH 17/51] Fix typo --- paddle/pybind/protobuf.cc | 8 ++++---- paddle/pybind/protobuf.h | 8 ++++---- paddle/pybind/pybind.cc | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 91f4c7d7c8..47b3c43ebf 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace framework { -void bind_program_desc(py::module &m) { +void BindProgramDesc(py::module &m) { py::class_(m, "ProgramDesc", "") .def_static("instance", [] { return &GetProgramDesc(); }, @@ -45,7 +45,7 @@ void bind_program_desc(py::module &m) { .def("__str__", [](ProgramDesc &self) { return self.DebugString(); }); } -void bind_block_desc(py::module &m) { +void BindBlockDesc(py::module &m) { py::class_(m, "BlockDesc", "") .def("id", [](BlockDesc &self) { return self.idx(); }) .def("parent", [](BlockDesc &self) { return self.parent_idx(); }) @@ -57,7 +57,7 @@ void bind_block_desc(py::module &m) { py::return_value_policy::reference); } -void bind_var_dses(py::module &m) { +void BindVarDsec(py::module &m) { py::class_(m, "VarDesc", "") .def(py::init<>()) .def("set_name", @@ -85,7 +85,7 @@ void bind_var_dses(py::module &m) { }); } -void bind_op_desc(py::module &m) { +void BindOpDesc(py::module &m) { auto op_desc_set_var = [](OpDesc::Var *var, const std::string ¶meter, const std::vector &arguments) { diff --git a/paddle/pybind/protobuf.h b/paddle/pybind/protobuf.h index ff4813cce7..a32acfb038 100644 --- a/paddle/pybind/protobuf.h +++ b/paddle/pybind/protobuf.h @@ -46,9 +46,9 @@ inline void VectorToRepeated(const std::vector& vec, } } -void bind_program_desc(py::module& m); -void bind_block_desc(py::module& m); -void bind_var_dses(py::module& m); -void bind_op_desc(py::module& m); +void BindProgramDesc(py::module& m); +void BindBlockDesc(py::module& m); +void BindVarDsec(py::module& m); +void BindOpDesc(py::module& m); } // namespace framework } // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 10c6670e00..d9dd7523bf 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -307,10 +307,10 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compile_gpu", IsCompileGPU); - bind_program_desc(m); - bind_block_desc(m); - bind_var_dses(m); - bind_op_desc(m); + BindProgramDesc(m); + BindBlockDesc(m); + BindVarDsec(m); + BindOpDesc(m); return m.ptr(); } From f5aa8b4d7ef508dcd66984ef36012eeff63a9c85 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 22 Sep 2017 13:22:21 -0700 Subject: [PATCH 18/51] Update namespace of pybind/protobuf.cc and .h --- paddle/pybind/protobuf.cc | 11 +++++++++-- paddle/pybind/protobuf.h | 4 ++-- .../paddle/v2/framework/tests/test_protobuf_descs.py | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 47b3c43ebf..bfbe177e8f 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -15,9 +15,10 @@ limitations under the License. 
*/ #include "paddle/pybind/protobuf.h" namespace paddle { -namespace framework { +namespace pybind { void BindProgramDesc(py::module &m) { + using namespace paddle::framework; // NOLINT py::class_(m, "ProgramDesc", "") .def_static("instance", [] { return &GetProgramDesc(); }, @@ -42,10 +43,14 @@ void BindProgramDesc(py::module &m) { .def("root_block", [](ProgramDesc &self) { return self.mutable_blocks()->Mutable(0); }, py::return_value_policy::reference) + .def("block", + [](ProgramDesc &self, int id) { return self.blocks(id); }, + py::return_value_policy::reference) .def("__str__", [](ProgramDesc &self) { return self.DebugString(); }); } void BindBlockDesc(py::module &m) { + using namespace paddle::framework; // NOLINT py::class_(m, "BlockDesc", "") .def("id", [](BlockDesc &self) { return self.idx(); }) .def("parent", [](BlockDesc &self) { return self.parent_idx(); }) @@ -58,6 +63,7 @@ void BindBlockDesc(py::module &m) { } void BindVarDsec(py::module &m) { + using namespace paddle::framework; // NOLINT py::class_(m, "VarDesc", "") .def(py::init<>()) .def("set_name", @@ -86,6 +92,7 @@ void BindVarDsec(py::module &m) { } void BindOpDesc(py::module &m) { + using namespace paddle::framework; // NOLINT auto op_desc_set_var = [](OpDesc::Var *var, const std::string ¶meter, const std::vector &arguments) { @@ -132,5 +139,5 @@ void BindOpDesc(py::module &m) { op_desc_set_attr(self, name)->set_i(i); }); } -} // namespace framework +} // namespace pybind } // namespace paddle diff --git a/paddle/pybind/protobuf.h b/paddle/pybind/protobuf.h index a32acfb038..de9a008e25 100644 --- a/paddle/pybind/protobuf.h +++ b/paddle/pybind/protobuf.h @@ -25,7 +25,7 @@ limitations under the License. */ namespace py = pybind11; namespace paddle { -namespace framework { +namespace pybind { template inline std::vector RepeatedToVector( @@ -50,5 +50,5 @@ void BindProgramDesc(py::module& m); void BindBlockDesc(py::module& m); void BindVarDsec(py::module& m); void BindOpDesc(py::module& m); -} // namespace framework +} // namespace pybind } // namespace paddle diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/framework/tests/test_protobuf_descs.py index 71bdca8765..d0192814ef 100644 --- a/python/paddle/v2/framework/tests/test_protobuf_descs.py +++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py @@ -23,6 +23,7 @@ class TestProgramDesc(unittest.TestCase): self.assertEqual(block_root.id(), block1.parent()) block3 = prog_desc.append_block(block_root) self.assertEqual(block3.parent(), block_root.id()) + self.assertEqual(prog_desc.block(1).id(), 1) class TestVarDesc(unittest.TestCase): From 027fc62cb020801cef53fb0e753d3a31fb7e6f39 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 22 Sep 2017 13:47:29 -0700 Subject: [PATCH 19/51] Use Vec2Repeated Repeated2Vec --- paddle/pybind/protobuf.cc | 31 +++++++++++++++++++++---------- paddle/pybind/protobuf.h | 19 ------------------- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index bfbe177e8f..b86185bf5b 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -17,6 +17,25 @@ limitations under the License. 
*/ namespace paddle { namespace pybind { +template +inline std::vector RepeatedToVector( + const google::protobuf::RepeatedField &repeated_field) { + std::vector ret; + ret.reserve(repeated_field.size()); + std::copy( + repeated_field.begin(), repeated_field.end(), std::back_inserter(ret)); + return ret; +} + +template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Reserve(vec.size()); + for (auto &elem : vec) { + *repeated_field->Add() = elem; + } +} + void BindProgramDesc(py::module &m) { using namespace paddle::framework; // NOLINT py::class_(m, "ProgramDesc", "") @@ -70,10 +89,7 @@ void BindVarDsec(py::module &m) { [](VarDesc &self, const std::string &name) { self.set_name(name); }) .def("set_shape", [](VarDesc &self, const std::vector &dims) { - LoDTensorDesc *lod_tensor_desc = self.mutable_lod_tensor(); - for (const int64_t &i : dims) { - lod_tensor_desc->add_dims(i); - } + VectorToRepeated(dims, self.mutable_lod_tensor()->mutable_dims()); }) .def("set_data_type", [](VarDesc &self, int type_id) { @@ -82,12 +98,7 @@ void BindVarDsec(py::module &m) { }) .def("shape", [](VarDesc &self) { const LoDTensorDesc &lod_tensor_desc = self.lod_tensor(); - int rank = lod_tensor_desc.dims_size(); - std::vector res(rank); - for (int i = 0; i < rank; ++i) { - res[i] = lod_tensor_desc.dims(i); - } - return res; + return RepeatedToVector(lod_tensor_desc.dims()); }); } diff --git a/paddle/pybind/protobuf.h b/paddle/pybind/protobuf.h index de9a008e25..2721c128d1 100644 --- a/paddle/pybind/protobuf.h +++ b/paddle/pybind/protobuf.h @@ -27,25 +27,6 @@ namespace py = pybind11; namespace paddle { namespace pybind { -template -inline std::vector RepeatedToVector( - const google::protobuf::RepeatedField& repeated_field) { - std::vector ret; - ret.reserve(repeated_field.size()); - std::copy( - repeated_field.begin(), repeated_field.end(), std::back_inserter(ret)); - return ret; -} - -template -inline void VectorToRepeated(const std::vector& vec, - RepeatedField* repeated_field) { - repeated_field->Reserve(vec.size()); - for (auto& elem : vec) { - *repeated_field->Add() = elem; - } -} - void BindProgramDesc(py::module& m); void BindBlockDesc(py::module& m); void BindVarDsec(py::module& m); From bddb40609d604cd68f6418423147ec1ec5ec8de0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 22 Sep 2017 15:35:48 -0700 Subject: [PATCH 20/51] Buggy code --- paddle/pybind/protobuf.cc | 285 ++++++++++++------ .../v2/framework/tests/test_protobuf_descs.py | 21 +- 2 files changed, 214 insertions(+), 92 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index b86185bf5b..b4ed9c4335 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -13,10 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/pybind/protobuf.h" +#include namespace paddle { namespace pybind { +using namespace paddle::framework; // NOLINT + template inline std::vector RepeatedToVector( const google::protobuf::RepeatedField &repeated_field) { @@ -36,45 +39,154 @@ inline void VectorToRepeated(const std::vector &vec, } } +class ProgramDescBind; +class OpDescBind; +class BlockDescBind; + +class OpDescBind { +public: + explicit OpDescBind(BlockDescBind *block) : block_(block) {} + + operator OpDesc *() { return &op_desc_; } + +private: + BlockDescBind *block_; + OpDesc op_desc_; +}; + +class BlockDescBind { +public: + BlockDescBind(ProgramDescBind *prog, BlockDesc *desc) + : prog_(prog), desc_(desc), need_update_(false) {} + + ~BlockDescBind() { + std::cerr << "dtor " << this << "," << desc_ << std::endl; + } + + int32_t id() const { + std::cerr << "desc ptr " << desc_ << std::endl; + return desc_->idx(); + } + + int32_t Parent() const { return desc_->parent_idx(); } + + OpDescBind *AppendOp() { + need_update_ = true; + ops_.emplace_back(this); + return &ops_.back(); + } + + void Sync() { + if (need_update_) { + auto &op_field = *this->desc_->mutable_ops(); + op_field.Clear(); + op_field.Reserve(static_cast(ops_.size())); + for (auto &op_desc : ops_) { + op_field.AddAllocated(op_desc); + } + } + } + +private: + ProgramDescBind *prog_; // not_own + BlockDesc *desc_; // not_own + bool need_update_; + + std::deque ops_; +}; + +using ProgDescMap = + std::unordered_map>; +static ProgDescMap *g_bind_map = nullptr; + +class ProgramDescBind { +public: + static ProgramDescBind &Instance(ProgramDesc *prog) { + if (g_bind_map == nullptr) { + g_bind_map = new ProgDescMap(); + } + auto &map = *g_bind_map; + auto &ptr = map[prog]; + + if (ptr == nullptr) { + ptr.reset(new ProgramDescBind(prog)); + } + return *ptr; + } + + BlockDescBind *AppendBlock(BlockDescBind *parent) { + auto *b = prog_->add_blocks(); + std::cerr << "block ptr " << b << std::endl; + std::cerr << "pass ptr " << parent << std::endl; + b->set_parent_idx(parent->id()); + b->set_idx(prog_->blocks_size() - 1); + blocks_.emplace_back(this, b); + return &blocks_.back(); + } + + BlockDescBind *Root() { return &blocks_.front(); } + + BlockDescBind *Block(size_t idx) { return &blocks_[idx]; } + + std::string DebugString() { return Proto()->DebugString(); } + + size_t Size() const { return blocks_.size(); } + + ProgramDesc *Proto() { + for (auto &block : blocks_) { + block.Sync(); + } + return prog_; + } + +private: + explicit ProgramDescBind(ProgramDesc *prog) : prog_(prog) { + for (auto &block : *prog->mutable_blocks()) { + blocks_.emplace_back(this, &block); + } + } + + // Not owned + ProgramDesc *prog_; + + std::vector blocks_; +}; + void BindProgramDesc(py::module &m) { - using namespace paddle::framework; // NOLINT - py::class_(m, "ProgramDesc", "") + py::class_(m, "ProgramDesc", "") .def_static("instance", - [] { return &GetProgramDesc(); }, + []() -> ProgramDescBind * { + return &ProgramDescBind::Instance(&GetProgramDesc()); + }, py::return_value_policy::reference) .def_static("__create_program_desc__", - [] { + []() -> ProgramDescBind * { // Only used for unit-test auto *prog_desc = new ProgramDesc; auto *block = prog_desc->mutable_blocks()->Add(); block->set_idx(0); block->set_parent_idx(-1); - return prog_desc; - }) + return &ProgramDescBind::Instance(prog_desc); + }, + py::return_value_policy::reference) .def("append_block", - [](ProgramDesc &self, BlockDesc &parent) { - auto desc = self.add_blocks(); - 
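
// Taken together, the classes above buffer edits on the C++ side and only write
// them back into the underlying protobuf messages on demand.  A rough sketch of
// the intended flow, using only the methods defined in this patch:
//
//   ProgramDescBind &prog = ProgramDescBind::Instance(&GetProgramDesc());
//   BlockDescBind *root  = prog.Root();
//   BlockDescBind *child = prog.AppendBlock(root);  // new BlockDesc, parent_idx set
//   child->AppendOp();                  // buffered in ops_, not yet in the proto
//   ProgramDesc *proto = prog.Proto();  // Proto() runs Sync() on every block
//   proto->DebugString();               // now reflects the buffered ops
//
// Python sees these objects through py::return_value_policy::reference, so the
// C++ side keeps ownership of every wrapper it hands out.
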
desc->set_idx(self.mutable_blocks()->size() - 1); - desc->set_parent_idx(parent.idx()); - return desc; - }, + &ProgramDescBind::AppendBlock, py::return_value_policy::reference) .def("root_block", - [](ProgramDesc &self) { return self.mutable_blocks()->Mutable(0); }, + &ProgramDescBind::Root, py::return_value_policy::reference) - .def("block", - [](ProgramDesc &self, int id) { return self.blocks(id); }, - py::return_value_policy::reference) - .def("__str__", [](ProgramDesc &self) { return self.DebugString(); }); + .def("block", &ProgramDescBind::Block, py::return_value_policy::reference) + .def("__str__", &ProgramDescBind::DebugString) + .def("num_blocks", &ProgramDescBind::Size); } void BindBlockDesc(py::module &m) { using namespace paddle::framework; // NOLINT - py::class_(m, "BlockDesc", "") - .def("id", [](BlockDesc &self) { return self.idx(); }) - .def("parent", [](BlockDesc &self) { return self.parent_idx(); }) + py::class_(m, "BlockDesc", "") + .def_property_readonly("id", &BlockDescBind::id) + .def_property_readonly("parent", &BlockDescBind::Parent) .def("append_op", - [](BlockDesc &self) { return self.add_ops(); }, + &BlockDescBind::AppendOp, py::return_value_policy::reference) .def("new_var", [](BlockDesc &self) { return self.add_vars(); }, @@ -82,73 +194,76 @@ void BindBlockDesc(py::module &m) { } void BindVarDsec(py::module &m) { - using namespace paddle::framework; // NOLINT - py::class_(m, "VarDesc", "") - .def(py::init<>()) - .def("set_name", - [](VarDesc &self, const std::string &name) { self.set_name(name); }) - .def("set_shape", - [](VarDesc &self, const std::vector &dims) { - VectorToRepeated(dims, self.mutable_lod_tensor()->mutable_dims()); - }) - .def("set_data_type", - [](VarDesc &self, int type_id) { - LoDTensorDesc *lod_tensor_desc = self.mutable_lod_tensor(); - lod_tensor_desc->set_data_type(static_cast(type_id)); - }) - .def("shape", [](VarDesc &self) { - const LoDTensorDesc &lod_tensor_desc = self.lod_tensor(); - return RepeatedToVector(lod_tensor_desc.dims()); - }); + py::class_(m, "VarDesc", ""); + // using namespace paddle::framework; // NOLINT + // py::class_(m, "VarDesc", "") + // .def(py::init<>()) + // .def("set_name", + // [](VarDesc &self, const std::string &name) { self.set_name(name); + // }) + // .def("set_shape", + // [](VarDesc &self, const std::vector &dims) { + // VectorToRepeated(dims, + // self.mutable_lod_tensor()->mutable_dims()); + // }) + // .def("set_data_type", + // [](VarDesc &self, int type_id) { + // LoDTensorDesc *lod_tensor_desc = self.mutable_lod_tensor(); + // lod_tensor_desc->set_data_type(static_cast(type_id)); + // }) + // .def("shape", [](VarDesc &self) { + // const LoDTensorDesc &lod_tensor_desc = self.lod_tensor(); + // return RepeatedToVector(lod_tensor_desc.dims()); + // }); } void BindOpDesc(py::module &m) { - using namespace paddle::framework; // NOLINT - auto op_desc_set_var = [](OpDesc::Var *var, - const std::string ¶meter, - const std::vector &arguments) { - var->set_parameter(parameter); - VectorToRepeated(arguments, var->mutable_arguments()); - }; - - auto op_desc_set_attr = [](OpDesc &desc, const std::string &name) { - auto attr = desc.add_attrs(); - attr->set_name(name); - return attr; - }; - - py::class_(m, "OpDesc", "") - .def("type", [](OpDesc &op) { return op.type(); }) - .def("set_input", - [op_desc_set_var](OpDesc &self, - const std::string ¶meter, - const std::vector &arguments) { - auto ipt = self.add_inputs(); - op_desc_set_var(ipt, parameter, arguments); - }) - .def("input_names", - [](OpDesc &self) { - 
std::vector ret_val; - ret_val.reserve(static_cast(self.inputs().size())); - std::transform( - self.inputs().begin(), - self.inputs().end(), - std::back_inserter(ret_val), - [](const OpDesc::Var &var) { return var.parameter(); }); - return ret_val; - }) - .def("__str__", [](OpDesc &self) { return self.DebugString(); }) - .def("set_output", - [op_desc_set_var](OpDesc &self, - const std::string ¶meter, - const std::vector &arguments) { - auto opt = self.add_outputs(); - op_desc_set_var(opt, parameter, arguments); - }) - .def("set_attr", - [op_desc_set_attr](OpDesc &self, const std::string &name, int i) { - op_desc_set_attr(self, name)->set_i(i); - }); + // auto op_desc_set_var = [](OpDesc::Var *var, + // const std::string ¶meter, + // const std::vector &arguments) { + // var->set_parameter(parameter); + // VectorToRepeated(arguments, var->mutable_arguments()); + // }; + // + // auto op_desc_set_attr = [](OpDesc &desc, const std::string &name) { + // auto attr = desc.add_attrs(); + // attr->set_name(name); + // return attr; + // }; + py::class_(m, "OpDesc", ""); + + // .def("type", [](OpDesc &op) { return op.type(); }) + // .def("set_input", + // [op_desc_set_var](OpDesc &self, + // const std::string ¶meter, + // const std::vector &arguments) { + // auto ipt = self.add_inputs(); + // op_desc_set_var(ipt, parameter, arguments); + // }) + // .def("input_names", + // [](OpDesc &self) { + // std::vector ret_val; + // ret_val.reserve(static_cast(self.inputs().size())); + // std::transform( + // self.inputs().begin(), + // self.inputs().end(), + // std::back_inserter(ret_val), + // [](const OpDesc::Var &var) { return var.parameter(); }); + // return ret_val; + // }) + // .def("__str__", [](OpDesc &self) { return self.DebugString(); }) + // .def("set_output", + // [op_desc_set_var](OpDesc &self, + // const std::string ¶meter, + // const std::vector &arguments) { + // auto opt = self.add_outputs(); + // op_desc_set_var(opt, parameter, arguments); + // }) + // .def("set_attr", + // [op_desc_set_attr](OpDesc &self, const std::string &name, int i) + // { + // op_desc_set_attr(self, name)->set_i(i); + // }); } } // namespace pybind } // namespace paddle diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/framework/tests/test_protobuf_descs.py index d0192814ef..b5ff2d4c36 100644 --- a/python/paddle/v2/framework/tests/test_protobuf_descs.py +++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py @@ -9,21 +9,28 @@ class TestProgramDesc(unittest.TestCase): del program_desc program_desc = core.ProgramDesc.instance() self.assertIsNotNone(program_desc) - self.assertIsNotNone(program_desc.root_block()) + self.assertIsNotNone(program_desc.block(0)) del program_desc def test_append_block(self): prog_desc = core.ProgramDesc.__create_program_desc__() self.assertIsNotNone(prog_desc) - block_root = prog_desc.root_block() - self.assertEqual(block_root.id(), 0) + block_root = prog_desc.block(0) + self.assertIsNotNone(block_root) + print 'here' + self.assertEqual(block_root.id, 0) block1 = prog_desc.append_block(block_root) block2 = prog_desc.append_block(block1) - self.assertEqual(block1.id(), block2.parent()) - self.assertEqual(block_root.id(), block1.parent()) + self.assertIsNotNone(block1) + print 'here' + self.assertEqual(block1.id, block2.parent) + print 'here' + self.assertEqual(block_root.id, block1.parent) + print 'here' block3 = prog_desc.append_block(block_root) - self.assertEqual(block3.parent(), block_root.id()) - self.assertEqual(prog_desc.block(1).id(), 1) + 
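
# Taken together, the assertions in this test pin down the ids and parent links
# that append_block records; for the calls above the expected layout is roughly:
#
#   block(0) / block_root   id 0, parent -1   (set up by __create_program_desc__)
#   block1                  id 1, parent 0
#   block2                  id 2, parent 1
#   block3                  id 3, parent 0
#
# which is why the test can also assert block(1).id == 1 and num_blocks() == 4.
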
self.assertEqual(block3.parent, block_root.id) + self.assertEqual(prog_desc.block(1).id, 1) + self.assertEqual(4, prog_desc.num_blocks()) class TestVarDesc(unittest.TestCase): From dc643a33523b48ace8e05dcfe0167b21d3687631 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 22 Sep 2017 15:42:14 -0700 Subject: [PATCH 21/51] Hot fix unittest --- paddle/pybind/protobuf.cc | 17 ++++------------- .../v2/framework/tests/test_protobuf_descs.py | 4 ---- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index b4ed9c4335..0fb78bf7a2 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -59,14 +59,7 @@ public: BlockDescBind(ProgramDescBind *prog, BlockDesc *desc) : prog_(prog), desc_(desc), need_update_(false) {} - ~BlockDescBind() { - std::cerr << "dtor " << this << "," << desc_ << std::endl; - } - - int32_t id() const { - std::cerr << "desc ptr " << desc_ << std::endl; - return desc_->idx(); - } + int32_t id() const { return desc_->idx(); } int32_t Parent() const { return desc_->parent_idx(); } @@ -114,11 +107,9 @@ public: return *ptr; } - BlockDescBind *AppendBlock(BlockDescBind *parent) { + BlockDescBind *AppendBlock(const BlockDescBind &parent) { auto *b = prog_->add_blocks(); - std::cerr << "block ptr " << b << std::endl; - std::cerr << "pass ptr " << parent << std::endl; - b->set_parent_idx(parent->id()); + b->set_parent_idx(parent.id()); b->set_idx(prog_->blocks_size() - 1); blocks_.emplace_back(this, b); return &blocks_.back(); @@ -141,6 +132,7 @@ public: private: explicit ProgramDescBind(ProgramDesc *prog) : prog_(prog) { + blocks_.reserve(100); for (auto &block : *prog->mutable_blocks()) { blocks_.emplace_back(this, &block); } @@ -181,7 +173,6 @@ void BindProgramDesc(py::module &m) { } void BindBlockDesc(py::module &m) { - using namespace paddle::framework; // NOLINT py::class_(m, "BlockDesc", "") .def_property_readonly("id", &BlockDescBind::id) .def_property_readonly("parent", &BlockDescBind::Parent) diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/framework/tests/test_protobuf_descs.py index b5ff2d4c36..fbe1f7152b 100644 --- a/python/paddle/v2/framework/tests/test_protobuf_descs.py +++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py @@ -17,16 +17,12 @@ class TestProgramDesc(unittest.TestCase): self.assertIsNotNone(prog_desc) block_root = prog_desc.block(0) self.assertIsNotNone(block_root) - print 'here' self.assertEqual(block_root.id, 0) block1 = prog_desc.append_block(block_root) block2 = prog_desc.append_block(block1) self.assertIsNotNone(block1) - print 'here' self.assertEqual(block1.id, block2.parent) - print 'here' self.assertEqual(block_root.id, block1.parent) - print 'here' block3 = prog_desc.append_block(block_root) self.assertEqual(block3.parent, block_root.id) self.assertEqual(prog_desc.block(1).id, 1) From e05e27a7f71ddb6549e406f0fbc339c789373935 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 22 Sep 2017 16:59:15 -0700 Subject: [PATCH 22/51] Fix bug --- paddle/pybind/protobuf.cc | 64 +++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 0fb78bf7a2..5511841c8b 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -42,15 +42,23 @@ inline void VectorToRepeated(const std::vector &vec, class ProgramDescBind; class OpDescBind; class BlockDescBind; +class VarDescBind; -class OpDescBind { +class VarDescBind { public: - 
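
// One reason the previous patch reserved space for 100 blocks, and a likely
// motivation for moving storage to heap-allocated elements here: AppendBlock()
// returns a raw pointer into blocks_, so any reallocation of the container
// would invalidate every handle already handed to Python.  A compressed
// illustration of that hazard (generic, not Paddle code):
//
//   std::vector<int> items{0};
//   int *handle = &items[0];                 // like the pointer Python keeps
//   for (int i = 0; i < 200; ++i) items.push_back(i);
//   // *handle is now dangling: the vector has almost certainly reallocated.
//
// Holding std::unique_ptr elements instead (as this patch does) keeps each
// wrapper at a stable address however much the containers grow.
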
explicit OpDescBind(BlockDescBind *block) : block_(block) {} + explicit VarDescBind(const std::string &name) { var_desc_.set_name(name); } + + VarDesc *Proto() { return &var_desc_; } + +private: + VarDesc var_desc_; +}; - operator OpDesc *() { return &op_desc_; } +class OpDescBind { +public: + OpDesc *Proto() { return &op_desc_; } private: - BlockDescBind *block_; OpDesc op_desc_; }; @@ -59,14 +67,28 @@ public: BlockDescBind(ProgramDescBind *prog, BlockDesc *desc) : prog_(prog), desc_(desc), need_update_(false) {} + BlockDescBind(const BlockDescBind &o) = delete; + BlockDescBind &operator=(const BlockDescBind &o) = delete; + int32_t id() const { return desc_->idx(); } int32_t Parent() const { return desc_->parent_idx(); } + VarDescBind *NewVar(const std::string &name) { + need_update_ = true; + auto it = vars_.find(name); + PADDLE_ENFORCE(it == vars_.end(), "Duplicated variable %s", name); + auto var = new VarDescBind(name); + vars_[name].reset(var); + return var; + } + + BlockDescBind *ParentBlock() const; + OpDescBind *AppendOp() { need_update_ = true; - ops_.emplace_back(this); - return &ops_.back(); + ops_.emplace_back(new OpDescBind()); + return ops_.back().get(); } void Sync() { @@ -75,8 +97,9 @@ public: op_field.Clear(); op_field.Reserve(static_cast(ops_.size())); for (auto &op_desc : ops_) { - op_field.AddAllocated(op_desc); + op_field.AddAllocated(op_desc->Proto()); } + need_update_ = false; } } @@ -85,7 +108,8 @@ private: BlockDesc *desc_; // not_own bool need_update_; - std::deque ops_; + std::deque> ops_; + std::unordered_map> vars_; }; using ProgDescMap = @@ -106,18 +130,20 @@ public: } return *ptr; } + ProgramDescBind(const ProgramDescBind &o) = delete; + ProgramDescBind &operator=(const ProgramDescBind &o) = delete; BlockDescBind *AppendBlock(const BlockDescBind &parent) { auto *b = prog_->add_blocks(); b->set_parent_idx(parent.id()); b->set_idx(prog_->blocks_size() - 1); - blocks_.emplace_back(this, b); - return &blocks_.back(); + blocks_.emplace_back(new BlockDescBind(this, b)); + return blocks_.back().get(); } - BlockDescBind *Root() { return &blocks_.front(); } + BlockDescBind *Root() { return blocks_.front().get(); } - BlockDescBind *Block(size_t idx) { return &blocks_[idx]; } + BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); } std::string DebugString() { return Proto()->DebugString(); } @@ -125,25 +151,31 @@ public: ProgramDesc *Proto() { for (auto &block : blocks_) { - block.Sync(); + block->Sync(); } return prog_; } private: explicit ProgramDescBind(ProgramDesc *prog) : prog_(prog) { - blocks_.reserve(100); for (auto &block : *prog->mutable_blocks()) { - blocks_.emplace_back(this, &block); + blocks_.emplace_back(new BlockDescBind(this, &block)); } } // Not owned ProgramDesc *prog_; - std::vector blocks_; + std::vector> blocks_; }; +BlockDescBind *BlockDescBind::ParentBlock() const { + if (this->desc_->parent_idx() == -1) { + return nullptr; + } + return prog_->Block(static_cast(this->desc_->parent_idx())); +} + void BindProgramDesc(py::module &m) { py::class_(m, "ProgramDesc", "") .def_static("instance", From eeb7c8ad795d6d7159d3659a2d41709653e2e347 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 22 Sep 2017 17:34:47 -0700 Subject: [PATCH 23/51] Compelete VarDescBind --- paddle/pybind/protobuf.cc | 44 ++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 5511841c8b..126c2ce1c7 100644 --- a/paddle/pybind/protobuf.cc +++ 
b/paddle/pybind/protobuf.cc @@ -46,12 +46,24 @@ class VarDescBind; class VarDescBind { public: - explicit VarDescBind(const std::string &name) { var_desc_.set_name(name); } + explicit VarDescBind(const std::string &name) { desc_.set_name(name); } - VarDesc *Proto() { return &var_desc_; } + VarDesc *Proto() { return &desc_; } + + void SetShape(const vector &dims) { + VectorToRepeated(dims, desc_.mutable_lod_tensor()->mutable_dims()); + } + + void SetDataType(int type_id) { + desc_.mutable_lod_tensor()->set_data_type(const_cast(type_id)); + } + + std::vector Shape() { + return RepeatedToVector(desc_.lod_tensor().dims()); + } private: - VarDesc var_desc_; + VarDesc desc_; }; class OpDescBind { @@ -217,27 +229,11 @@ void BindBlockDesc(py::module &m) { } void BindVarDsec(py::module &m) { - py::class_(m, "VarDesc", ""); - // using namespace paddle::framework; // NOLINT - // py::class_(m, "VarDesc", "") - // .def(py::init<>()) - // .def("set_name", - // [](VarDesc &self, const std::string &name) { self.set_name(name); - // }) - // .def("set_shape", - // [](VarDesc &self, const std::vector &dims) { - // VectorToRepeated(dims, - // self.mutable_lod_tensor()->mutable_dims()); - // }) - // .def("set_data_type", - // [](VarDesc &self, int type_id) { - // LoDTensorDesc *lod_tensor_desc = self.mutable_lod_tensor(); - // lod_tensor_desc->set_data_type(static_cast(type_id)); - // }) - // .def("shape", [](VarDesc &self) { - // const LoDTensorDesc &lod_tensor_desc = self.lod_tensor(); - // return RepeatedToVector(lod_tensor_desc.dims()); - // }); + py::class_(m, "VarDesc", "") + .def(py::init<>()) + .def("set_shape", VarDescBind::SetShape) + .def("set_data_type", VarDescBind::SetDataType) + .def("shape", VarDescBind::Shape); } void BindOpDesc(py::module &m) { From ddf2448484cb6d183032e8d616ed51176dea9ded Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 22 Sep 2017 17:46:48 -0700 Subject: [PATCH 24/51] Update Input/Output of Op --- paddle/pybind/protobuf.cc | 145 +++++++++++------- .../v2/framework/tests/test_protobuf_descs.py | 19 +++ 2 files changed, 112 insertions(+), 52 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 5511841c8b..67d6252af8 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -14,6 +14,7 @@ limitations under the License. 
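
The three accessors above (set_shape / set_data_type / shape) are what TestVarDesc drives from Python; once new_var returns this wrapper rather than a raw VarDesc (see the "Fix bugs" patch further down), the round trip looks roughly like:

    block = core.ProgramDesc.instance().block(0)
    var = block.new_var('my_var')          # named new_var lands in a later patch
    var.set_shape([3, 2, 10, 8])
    var.set_data_type(0)                   # any valid framework DataType value
    assert var.shape() == [3, 2, 10, 8]
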
*/ #include "paddle/pybind/protobuf.h" #include +#include "paddle/framework/attribute.h" namespace paddle { namespace pybind { @@ -56,10 +57,90 @@ private: class OpDescBind { public: - OpDesc *Proto() { return &op_desc_; } + OpDesc *Proto() { + Sync(); + return &op_desc_; + } + + std::string Type() const { return op_desc_.type(); } + + void SetType(const std::string &type) { op_desc_.set_type(type); } + + const std::vector &Input(const std::string &name) const { + auto it = inputs_.find(name); + PADDLE_ENFORCE( + it != inputs_.end(), "Input %s cannot be found in Op %s", name, Type()); + return it->second; + } + + std::vector InputNames() const { + std::vector retv; + retv.reserve(this->inputs_.size()); + for (auto &ipt : this->inputs_) { + retv.push_back(ipt.first); + } + return retv; + } + + void SetInput(const std::string ¶m_name, + const std::vector &args) { + need_update_ = true; + inputs_[param_name] = args; + } + + const std::vector &Output(const std::string &name) const { + auto it = outputs_.find(name); + PADDLE_ENFORCE(it != outputs_.end(), + "Output %s cannot be found in Op %s", + name, + Type()); + return it->second; + } + + std::vector OutputNames() const { + std::vector retv; + retv.reserve(this->outputs_.size()); + for (auto &ipt : this->outputs_) { + retv.push_back(ipt.first); + } + return retv; + } + + void SetOutput(const std::string ¶m_name, + const std::vector &args) { + need_update_ = true; + this->outputs_[param_name] = args; + } + + std::string DebugString() { return this->Proto()->DebugString(); } + + void Sync() { + if (need_update_) { + this->op_desc_.mutable_inputs()->Clear(); + for (auto &ipt : inputs_) { + auto *input = op_desc_.add_inputs(); + input->set_parameter(ipt.first); + VectorToRepeated(ipt.second, input->mutable_arguments()); + } + + this->op_desc_.mutable_outputs()->Clear(); + for (auto &opt : outputs_) { + auto *output = op_desc_.add_outputs(); + output->set_parameter(opt.first); + VectorToRepeated(opt.second, output->mutable_arguments()); + } + + need_update_ = false; + } + } private: OpDesc op_desc_; + std::unordered_map> inputs_; + std::unordered_map> outputs_; + std::unordered_map attrs_; + + bool need_update_{false}; }; class BlockDescBind { @@ -141,8 +222,6 @@ public: return blocks_.back().get(); } - BlockDescBind *Root() { return blocks_.front().get(); } - BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); } std::string DebugString() { return Proto()->DebugString(); } @@ -196,9 +275,6 @@ void BindProgramDesc(py::module &m) { .def("append_block", &ProgramDescBind::AppendBlock, py::return_value_policy::reference) - .def("root_block", - &ProgramDescBind::Root, - py::return_value_policy::reference) .def("block", &ProgramDescBind::Block, py::return_value_policy::reference) .def("__str__", &ProgramDescBind::DebugString) .def("num_blocks", &ProgramDescBind::Size); @@ -241,52 +317,17 @@ void BindVarDsec(py::module &m) { } void BindOpDesc(py::module &m) { - // auto op_desc_set_var = [](OpDesc::Var *var, - // const std::string ¶meter, - // const std::vector &arguments) { - // var->set_parameter(parameter); - // VectorToRepeated(arguments, var->mutable_arguments()); - // }; - // - // auto op_desc_set_attr = [](OpDesc &desc, const std::string &name) { - // auto attr = desc.add_attrs(); - // attr->set_name(name); - // return attr; - // }; - py::class_(m, "OpDesc", ""); - - // .def("type", [](OpDesc &op) { return op.type(); }) - // .def("set_input", - // [op_desc_set_var](OpDesc &self, - // const std::string ¶meter, - // const std::vector 
&arguments) { - // auto ipt = self.add_inputs(); - // op_desc_set_var(ipt, parameter, arguments); - // }) - // .def("input_names", - // [](OpDesc &self) { - // std::vector ret_val; - // ret_val.reserve(static_cast(self.inputs().size())); - // std::transform( - // self.inputs().begin(), - // self.inputs().end(), - // std::back_inserter(ret_val), - // [](const OpDesc::Var &var) { return var.parameter(); }); - // return ret_val; - // }) - // .def("__str__", [](OpDesc &self) { return self.DebugString(); }) - // .def("set_output", - // [op_desc_set_var](OpDesc &self, - // const std::string ¶meter, - // const std::vector &arguments) { - // auto opt = self.add_outputs(); - // op_desc_set_var(opt, parameter, arguments); - // }) - // .def("set_attr", - // [op_desc_set_attr](OpDesc &self, const std::string &name, int i) - // { - // op_desc_set_attr(self, name)->set_i(i); - // }); + py::class_(m, "OpDesc", "") + .def("type", &OpDescBind::Type) + .def("set_type", &OpDescBind::SetType) + .def("input", &OpDescBind::Input) + .def("input_names", &OpDescBind::InputNames) + .def("set_input", &OpDescBind::SetInput) + .def("output", &OpDescBind::Output) + .def("output_names", &OpDescBind::OutputNames) + .def("set_output", &OpDescBind::SetOutput) + .def("__str__", &OpDescBind::DebugString) + .def("__repr__", &OpDescBind::DebugString); } } // namespace pybind } // namespace paddle diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/framework/tests/test_protobuf_descs.py index fbe1f7152b..950a936307 100644 --- a/python/paddle/v2/framework/tests/test_protobuf_descs.py +++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py @@ -2,6 +2,25 @@ import unittest import paddle.v2.framework.core as core +class TestOpDesc(unittest.TestCase): + def test_op_desc(self): + prog = core.ProgramDesc.__create_program_desc__() + self.assertIsNotNone(prog) + block = prog.block(0) + self.assertIsNotNone(block) + op = block.append_op() + self.assertIsNotNone(op) + op.set_type("test") + self.assertEqual("test", op.type()) + op.set_input("X", ["a", "b", "c"]) + self.assertEqual(["a", "b", "c"], op.input("X")) + self.assertEqual(["X"], op.input_names()) + + op.set_output("Out", ["z"]) + self.assertEqual(['z'], op.output("Out")) + self.assertEqual(["Out"], op.output_names()) + + class TestProgramDesc(unittest.TestCase): def test_instance(self): program_desc = core.ProgramDesc.instance() From 08e99006216395ee61f3dad3047dd44316829a66 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 22 Sep 2017 18:29:25 -0700 Subject: [PATCH 25/51] Fix bugs --- paddle/pybind/protobuf.cc | 18 +++++++++--------- .../v2/framework/tests/test_protobuf_descs.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 126c2ce1c7..de6db60730 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -50,12 +50,12 @@ public: VarDesc *Proto() { return &desc_; } - void SetShape(const vector &dims) { + void SetShape(const std::vector &dims) { VectorToRepeated(dims, desc_.mutable_lod_tensor()->mutable_dims()); } void SetDataType(int type_id) { - desc_.mutable_lod_tensor()->set_data_type(const_cast(type_id)); + desc_.mutable_lod_tensor()->set_data_type(static_cast(type_id)); } std::vector Shape() { @@ -86,7 +86,8 @@ public: int32_t Parent() const { return desc_->parent_idx(); } - VarDescBind *NewVar(const std::string &name) { + VarDescBind *NewVar(py::bytes name_bytes) { + std::string name = name_bytes; need_update_ = true; auto it = 
vars_.find(name); PADDLE_ENFORCE(it == vars_.end(), "Duplicated variable %s", name); @@ -224,16 +225,15 @@ void BindBlockDesc(py::module &m) { &BlockDescBind::AppendOp, py::return_value_policy::reference) .def("new_var", - [](BlockDesc &self) { return self.add_vars(); }, + &BlockDescBind::NewVar, py::return_value_policy::reference); } void BindVarDsec(py::module &m) { - py::class_(m, "VarDesc", "") - .def(py::init<>()) - .def("set_shape", VarDescBind::SetShape) - .def("set_data_type", VarDescBind::SetDataType) - .def("shape", VarDescBind::Shape); + py::class_(m, "VarDesc", "") + .def("set_shape", &VarDescBind::SetShape) + .def("set_data_type", &VarDescBind::SetDataType) + .def("shape", &VarDescBind::Shape); } void BindOpDesc(py::module &m) { diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/framework/tests/test_protobuf_descs.py index fbe1f7152b..f1074f6bb5 100644 --- a/python/paddle/v2/framework/tests/test_protobuf_descs.py +++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py @@ -33,7 +33,7 @@ class TestVarDesc(unittest.TestCase): def test_shape(self): program_desc = core.ProgramDesc.instance() block = program_desc.root_block() - var = block.new_var() + var = block.new_var('my_var') src_shape = [3, 2, 10, 8] var.set_shape(src_shape) res_shape = var.shape() From afeb01f7620a2280c946f359065589d8b0d59062 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 22 Sep 2017 18:30:05 -0700 Subject: [PATCH 26/51] Stash --- paddle/pybind/protobuf.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 67d6252af8..345bb02c86 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -114,6 +114,14 @@ public: std::string DebugString() { return this->Proto()->DebugString(); } + struct SetAttrDescVisitor : public boost::static_visitor { + explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {} + OpDesc::Attr *attr_; + void operator()(int v) { attr_->set_i(v); } + void operator()(float v) { attr_->set_f(v); } + void operator()(const std::string &v) { attr_->set_s(v); } + }; + void Sync() { if (need_update_) { this->op_desc_.mutable_inputs()->Clear(); @@ -130,6 +138,13 @@ public: VectorToRepeated(opt.second, output->mutable_arguments()); } + this->op_desc_.mutable_attrs()->Clear(); + for (auto &attr : attrs_) { + auto *attr_desc = op_desc_.add_attrs(); + attr_desc->set_name(attr.first); + attr_desc->set_type(static_cast(attr.second.which() - 1)); + } + need_update_ = false; } } From fdd68fd1a17a78ead01911a8685f02ea871398a5 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 25 Sep 2017 10:14:38 -0700 Subject: [PATCH 27/51] Refine Visitor --- paddle/pybind/protobuf.cc | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 23c322ac36..3b2ac68714 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -128,10 +128,24 @@ public: struct SetAttrDescVisitor : public boost::static_visitor { explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {} - OpDesc::Attr *attr_; - void operator()(int v) { attr_->set_i(v); } - void operator()(float v) { attr_->set_f(v); } - void operator()(const std::string &v) { attr_->set_s(v); } + mutable OpDesc::Attr *attr_; + void operator()(int v) const { attr_->set_i(v); } + void operator()(float v) const { attr_->set_f(v); } + void operator()(const std::string &v) const { attr_->set_s(v); } + void operator()(bool b) const { 
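
// A static_visitor like the one above is driven with boost::apply_visitor; a
// rough sketch of how Sync() would use it when serializing attrs_ (assuming
// attrs_ maps attribute names to the framework Attribute variant, as the
// which() - 1 conversion in the earlier Sync() suggests):
//
//   for (auto &attr : attrs_) {
//     auto *attr_desc = op_desc_.add_attrs();
//     attr_desc->set_name(attr.first);
//     attr_desc->set_type(static_cast<framework::AttrType>(attr.second.which() - 1));
//     boost::apply_visitor(SetAttrDescVisitor(attr_desc), attr.second);
//   }
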
attr_->set_b(b); } + + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_ints()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_floats()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_strings()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_bools()); + } }; void Sync() { From 699dbe3be9fd3020fe44d1401f5d1a492d98e40b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 25 Sep 2017 11:38:16 -0700 Subject: [PATCH 28/51] Use `bool` for PADDLE_ENFORCE, not int * If stat is an integer, bool value will implicit cast to int before pass to PADDLE_ENFORCE --- paddle/platform/enforce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index df5f71ed76..b523ef03c0 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -107,7 +107,7 @@ struct EnforceNotMet : public std::exception { template inline typename std::enable_if::type throw_on_error( - int stat, const Args&... args) { + bool stat, const Args&... args) { if (UNLIKELY(!(stat))) { throw std::runtime_error(string::Sprintf(args...)); } From 1bfa0e130a17613c0b0e2108e7d220d4630e928a Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 25 Sep 2017 13:48:13 -0700 Subject: [PATCH 29/51] Add `Prepend` for BlockDescBind --- paddle/pybind/protobuf.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index de6db60730..f119a12e98 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -104,6 +104,12 @@ public: return ops_.back().get(); } + OpDescBind *PrependOp() { + need_update_ = true; + ops_.emplace_front(new OpDescBind()); + return ops_.front().get(); + } + void Sync() { if (need_update_) { auto &op_field = *this->desc_->mutable_ops(); @@ -224,6 +230,9 @@ void BindBlockDesc(py::module &m) { .def("append_op", &BlockDescBind::AppendOp, py::return_value_policy::reference) + .def("prepend_op", + &BlockDescBind::PrependOp, + py::return_value_policy::reference) .def("new_var", &BlockDescBind::NewVar, py::return_value_policy::reference); From f9f910a33bb33fd94fd645743518a4711a7e0017 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 25 Sep 2017 16:41:18 -0700 Subject: [PATCH 30/51] Complete op --- paddle/pybind/protobuf.cc | 150 +++++++++++++++++- .../v2/framework/tests/test_protobuf_descs.py | 36 ++++- 2 files changed, 181 insertions(+), 5 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 7af93cca99..673e0ab80b 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -14,8 +14,72 @@ limitations under the License. */ #include "paddle/pybind/protobuf.h" #include +#include #include "paddle/framework/attribute.h" +// Cast boost::variant for PyBind. +// Copy from +// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199 +namespace pybind11 { +namespace detail { + +// Can be replaced by a generic lambda in C++14 +struct variant_caster_visitor : public boost::static_visitor { + return_value_policy policy; + handle parent; + + variant_caster_visitor(return_value_policy policy, handle parent) + : policy(policy), parent(parent) {} + + template + handle operator()(T const &src) const { + return make_caster::cast(src, policy, parent); + } +}; + +template +struct variant_caster; + +template
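
// The enforce change above matters for checks like the one in NewVar(): with the
// parameter declared as int, a boolean condition such as it == vars_.end() was
// first promoted to int and then tested; declaring it bool keeps the condition a
// plain bool all the way through.  A stripped-down view of the new shape
// (paraphrased, not the full Paddle header):
//
//   template <typename... Args>
//   inline void throw_on_error(bool stat, const Args &... args) {
//     if (UNLIKELY(!(stat))) {
//       throw std::runtime_error(string::Sprintf(args...));
//     }
//   }
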