From ed72af48cecb1462b14c2105e9451052fa583cde Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Fri, 1 Sep 2017 19:01:12 +0800 Subject: [PATCH 01/16] Add cos_sim op. --- paddle/operators/CMakeLists.txt | 2 +- paddle/operators/cos_sim_op.cc | 91 ++++++++++++++++++ paddle/operators/cos_sim_op.cu | 22 +++++ paddle/operators/cos_sim_op.h | 93 +++++++++++++++++++ paddle/pybind/pybind.cc | 1 + .../paddle/v2/framework/tests/CMakeLists.txt | 1 + .../v2/framework/tests/gradient_checker.py | 21 +++-- .../paddle/v2/framework/tests/op_test_util.py | 8 +- .../v2/framework/tests/test_cos_sim_op.py | 40 ++++++++ 9 files changed, 267 insertions(+), 12 deletions(-) create mode 100644 paddle/operators/cos_sim_op.cc create mode 100644 paddle/operators/cos_sim_op.cu create mode 100644 paddle/operators/cos_sim_op.h create mode 100644 python/paddle/v2/framework/tests/test_cos_sim_op.py diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index e5efcccb0e..25dbd236e6 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -56,7 +56,7 @@ list(REMOVE_ITEM GENERAL_OPS op_library(net_op SRCS net_op.cc) op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op) op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) -op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc +op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc DEPS framework_proto tensor operator net_op) op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op) diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc new file mode 100644 index 0000000000..21a616522b --- /dev/null +++ b/paddle/operators/cos_sim_op.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/cos_sim_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class CosSimOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), + ctx.Input("Y")->dims(), + "Dimensions of Input(X) and Input(Y) must be the same."); + + auto dims = ctx.Input("X")->dims(); + ctx.Output("Out")->Resize({dims[0], 1}); + } +}; + +class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CosSimOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of cos_sim op."); + AddInput("Y", "The second input of cos_sim op."); + AddOutput("Out", "The output of cos_sim op."); + AddComment(R"DOC( +Cosine Similarity Operator. 
+ +The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y)) +)DOC"); + } +}; + +class CosSimOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + auto x_dims = ctx.Input("X")->dims(); + auto y_dims = ctx.Input("Y")->dims(); + auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); + PADDLE_ENFORCE_EQ(x_dims, y_dims, + "Dimensions of Input(X) and Input(Y) must be the same."); + PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0], + "1st dimension of Out@GRAD must equal to Input(X)"); + PADDLE_ENFORCE_EQ(out_dims[1], 1, + "1st dimension of Out@GRAD must equal to Input(X)"); + + auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto *y_grad = ctx.Output(framework::GradVarName("Y")); + x_grad->Resize(x_dims); + y_grad->Resize(y_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad, + ops::CosSimOpGrad); +REGISTER_OP_CPU_KERNEL(cos_sim, + ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL( + cos_sim_grad, ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu new file mode 100644 index 0000000000..0cb8fd26de --- /dev/null +++ b/paddle/operators/cos_sim_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/cos_sim_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(cos_sim, + ops::CosSimKernel); +REGISTER_OP_GPU_KERNEL( + cos_sim_grad, ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h new file mode 100644 index 0000000000..5247087cc1 --- /dev/null +++ b/paddle/operators/cos_sim_op.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class CosSimKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Output("Out"); + + z->mutable_data(context.GetPlace()); + + auto dims = x->dims(); + int size = static_cast(framework::product(dims)); + auto new_dims = framework::make_ddim({dims[0], size / dims[0]}); + auto X = EigenMatrix::From(*x, new_dims); + auto Y = EigenMatrix::From(*y, new_dims); + auto Z = EigenMatrix::From(*z, new_dims); + + auto XY = (X * Y).sum(Eigen::array({1})); + auto XX = (X * X).sum(Eigen::array({1})); + auto YY = (Y * Y).sum(Eigen::array({1})); + auto place = context.GetEigenDevice(); + Z.device(place) = XY / XX.sqrt() / YY.sqrt(); + } +}; + +template +class CosSimGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Input("Out"); + auto* grad_x = context.Output(framework::GradVarName("X")); + auto* grad_y = context.Output(framework::GradVarName("Y")); + auto* grad_z = context.Input(framework::GradVarName("Out")); + + grad_x->mutable_data(context.GetPlace()); + grad_y->mutable_data(context.GetPlace()); + + auto dims = x->dims(); + int size = static_cast(framework::product(dims)); + auto new_dims = framework::make_ddim({dims[0], size / dims[0]}); + auto X = EigenMatrix::From(*x, new_dims); + auto Y = EigenMatrix::From(*y, new_dims); + auto Z = EigenMatrix::From(*z); + auto dX = EigenMatrix::From(*grad_x, new_dims); + auto dY = EigenMatrix::From(*grad_y, new_dims); + auto dZ = EigenMatrix::From(*grad_z); + + auto XX = (X * X).sum(Eigen::array({1})); + auto YY = (Y * Y).sum(Eigen::array({1})); + Eigen::DSizes bcast(1, dims[1]); + auto denominator_bcast = (XX.sqrt() * YY.sqrt()).broadcast(bcast); + auto Z_bcast = Z.broadcast(bcast); + auto dZ_bcast = dZ.broadcast(bcast); + auto place = context.GetEigenDevice(); + dX.device(place) = + dZ_bcast * (Y / denominator_bcast - Z_bcast * X / XX.broadcast(bcast)); + dY.device(place) = + dZ_bcast * (X / denominator_bcast - Z_bcast * Y / YY.broadcast(bcast)); + // dX.device(place) = X; + // Y.device(place) = Y; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 3bc150ccb7..a8ec35dfad 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -46,6 +46,7 @@ USE_OP(lookup_table); USE_OP(scale); USE_OP_ITSELF(identity); USE_OP(minus); +USE_OP(cos_sim); USE_CPU_ONLY_OP(gather); USE_CPU_ONLY_OP(scatter); diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 661ebd8964..e0f77d7973 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -4,6 +4,7 @@ py_test(test_scope SRCS test_scope.py) py_test(test_tensor SRCS test_tensor.py) py_test(test_mul_op SRCS test_mul_op.py) +py_test(test_cos_sim_op SRCS test_cos_sim_op.py) py_test(test_mean_op SRCS test_mean_op.py) diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py index 518f828bac..bf01ea4876 100644 --- 
a/python/paddle/v2/framework/tests/gradient_checker.py +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -36,13 +36,13 @@ def get_numeric_gradient(op, in_place=False): """ Get Numeric Gradient for an operator's input. - - :param op: C++ operator instance, could be an network - :param input_values: The input variables. Should be an dictionary, key is + + :param op: C++ operator instance, could be an network + :param input_values: The input variables. Should be an dictionary, key is variable name. Value is numpy array. - :param output_name: The final output variable name. + :param output_name: The final output variable name. :param input_to_check: The input variable need to get gradient. - :param delta: The perturbation value for numeric gradient method. The + :param delta: The perturbation value for numeric gradient method. The smaller delta is, the more accurate result will get. But if that delta is too small, it could occur numerical stability problem. :param local_scope: The local scope used for get_numeric_gradient. @@ -229,9 +229,9 @@ class GradientChecker(unittest.TestCase): """Use relative error for the comparison. :param numeric_grads: the numerical graidents. - :type numeric_grads: a list of numpy.array + :type numeric_grads: a list of numpy.array :param analytic_grads: the analytical graidents. - :type analytic_grads: a list of numpy.array + :type analytic_grads: a list of numpy.array :param name: the names of gradients, used to print for debug. :type names: a list of string :param msg_prefix: string info, used to print for debug. @@ -304,6 +304,13 @@ class GradientChecker(unittest.TestCase): # get analytical gradients according to different device analytic_grads = self.__get_gradient(forward_op, backward_op, input_vars, check_names, place) + #print(numeric_grads[0], numeric_grads[0].shape) + print("dim0: ", numeric_grads[0], numeric_grads[0].shape) + print("dim0: ", analytic_grads[0], analytic_grads[0].shape) + print("---------------------") + print("dim1: ", numeric_grads[1], numeric_grads[1].shape) + print("dim1: ", analytic_grads[1], analytic_grads[1].shape) + assert False self.__assert_is_close(numeric_grads, analytic_grads, check_names, max_relative_error, "Gradient Check On %s" % str(place)) diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py index 3bc05a0fec..a4899355b5 100644 --- a/python/paddle/v2/framework/tests/op_test_util.py +++ b/python/paddle/v2/framework/tests/op_test_util.py @@ -6,13 +6,13 @@ from paddle.v2.framework.op import Operator class OpTestMeta(type): """ Operator Test ClassMeta. - - It injects `test_all` method into user's OperatorTest class, to make Python + + It injects `test_all` method into user's OperatorTest class, to make Python unittest module run that method. - + The `test_all` read what value is stored in `self`. It use self's values to create and run a operator, and check whether that op is OK or not. - + See `test_add_two_op` for example usage. 
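 
     A minimal sketch of such a test, modeled on `test_add_two_op` (assuming
     the usual `unittest`/`numpy` imports; the op name and shapes below are
     illustrative only):
 
         class TestAddOp(unittest.TestCase):
             __metaclass__ = OpTestMeta
 
             def setUp(self):
                 self.type = "add_two"
                 self.inputs = {
                     'X': numpy.random.random((102, 105)).astype("float32"),
                     'Y': numpy.random.random((102, 105)).astype("float32")
                 }
                 self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}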
""" diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/framework/tests/test_cos_sim_op.py new file mode 100644 index 0000000000..f3b04d25f2 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py @@ -0,0 +1,40 @@ +import unittest +import numpy as np +from gradient_checker import GradientChecker, create_op +from op_test_util import OpTestMeta + + +class TestCosSimOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "cos_sim" + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((32, 84)).astype("float32") + } + expect = (self.inputs['X'] * self.inputs['Y']).sum(axis=1) / \ + np.linalg.norm(self.inputs['X'], axis=1) / \ + np.linalg.norm(self.inputs['Y'], axis=1) + expect = np.expand_dims(expect, 1) + self.outputs = {'Out': expect} + + +class CosSimGradOpTest(GradientChecker): + def test_cos_sim(self): + op = create_op("cos_sim") + #inputs = { + #'X': np.random.random((2, 2)).astype("float32"), + #'Y': np.random.random((2, 2)).astype("float32") + #} + inputs = { + 'X': np.array([[0.9, 0.6], [1.9, 1.6]]).astype("float32"), + 'Y': np.array([[0.7, 0.8], [1.7, 1.8]]).astype("float32") + } + print(inputs) + self.check_grad( + op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5) + + +if __name__ == '__main__': + unittest.main() From 9d2909be2abcb9b8728f7c4de7437c07a1254b8b Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 1 Sep 2017 19:27:52 +0800 Subject: [PATCH 02/16] rename add_op to add_two_op --- paddle/framework/CMakeLists.txt | 2 +- paddle/operators/{add_op.cc => add_two_op.cc} | 2 +- paddle/operators/{add_op.cu => add_two_op.cu} | 2 +- paddle/operators/{add_op.h => add_two_op.h} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename paddle/operators/{add_op.cc => add_two_op.cc} (98%) rename paddle/operators/{add_op.cu => add_two_op.cu} (95%) rename paddle/operators/{add_op.h => add_two_op.h} (100%) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index c0838d9b75..e138517b6b 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -25,7 +25,7 @@ cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator) cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) -cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op) +cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_two_op) py_proto_compile(framework_py_proto SRCS framework.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_two_op.cc similarity index 98% rename from paddle/operators/add_op.cc rename to paddle/operators/add_two_op.cc index 8ab748ed71..bc99e306e0 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_two_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/operators/add_op.h" +#include "paddle/operators/add_two_op.h" namespace paddle { namespace operators { diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_two_op.cu similarity index 95% rename from paddle/operators/add_op.cu rename to paddle/operators/add_two_op.cu index cec5f558cb..acc03b2c8b 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_two_op.cu @@ -14,7 +14,7 @@ #define EIGEN_USE_GPU #include "paddle/framework/op_registry.h" -#include "paddle/operators/add_op.h" +#include "paddle/operators/add_two_op.h" namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(add_two, diff --git a/paddle/operators/add_op.h b/paddle/operators/add_two_op.h similarity index 100% rename from paddle/operators/add_op.h rename to paddle/operators/add_two_op.h From 91215bcef907baffdf52bb7894a0a33d0253c16f Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Sat, 2 Sep 2017 17:25:30 +0800 Subject: [PATCH 03/16] Fix a bug causing wrong gradient results in cos_sim op. --- paddle/operators/cos_sim_op.cc | 32 ++++++++++---- paddle/operators/cos_sim_op.h | 34 +++++++++------ .../v2/framework/tests/gradient_checker.py | 7 ---- .../v2/framework/tests/test_cos_sim_op.py | 42 ++++++++++++------- 4 files changed, 71 insertions(+), 44 deletions(-) diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 21a616522b..3760d0b161 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -25,14 +25,16 @@ class CosSimOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null."); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null."); PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), ctx.Input("Y")->dims(), "Dimensions of Input(X) and Input(Y) must be the same."); auto dims = ctx.Input("X")->dims(); ctx.Output("Out")->Resize({dims[0], 1}); + ctx.Output("XNorm")->Resize({dims[0], 1}); + ctx.Output("YNorm")->Resize({dims[0], 1}); } }; @@ -43,6 +45,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The first input of cos_sim op."); AddInput("Y", "The second input of cos_sim op."); AddOutput("Out", "The output of cos_sim op."); + AddOutput("XNorm", "Row norm of the first input.").AsIntermediate(); + AddOutput("YNorm", "Row norm of the second input.").AsIntermediate(); + AddComment(R"DOC( Cosine Similarity Operator. 
@@ -57,20 +62,31 @@ class CosSimOpGrad : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null."); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("XNorm"), + "Input(XNorm) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("YNorm"), + "Input(YNorm) must not be null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null."); + "Input(Out@GRAD) must not be null."); auto x_dims = ctx.Input("X")->dims(); auto y_dims = ctx.Input("Y")->dims(); + auto xnorm_dims = ctx.Input("XNorm")->dims(); + auto ynorm_dims = ctx.Input("YNorm")->dims(); auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); PADDLE_ENFORCE_EQ(x_dims, y_dims, "Dimensions of Input(X) and Input(Y) must be the same."); + PADDLE_ENFORCE_EQ(xnorm_dims[0], x_dims[0], + "1st dimension of XNorm must equal that of Input(X)."); + PADDLE_ENFORCE_EQ(xnorm_dims[1], 1, "2st dimension of XNorm must be one."); + PADDLE_ENFORCE_EQ(ynorm_dims[0], y_dims[0], + "1st dimension of YNorm must equal that of Input(Y)."); + PADDLE_ENFORCE_EQ(ynorm_dims[1], 1, "2st dimension of YNorm must be one."); PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0], - "1st dimension of Out@GRAD must equal to Input(X)"); - PADDLE_ENFORCE_EQ(out_dims[1], 1, - "1st dimension of Out@GRAD must equal to Input(X)"); + "1st dimension of Out@GRAD must equal that of Input(X)"); + PADDLE_ENFORCE_EQ(out_dims[1], 1, "1st dimension of Out@GRAD must be one."); auto *x_grad = ctx.Output(framework::GradVarName("X")); auto *y_grad = ctx.Output(framework::GradVarName("Y")); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 5247087cc1..69d35d8bc2 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -31,21 +31,27 @@ class CosSimKernel : public framework::OpKernel { auto* x = context.Input("X"); auto* y = context.Input("Y"); auto* z = context.Output("Out"); + auto* x_norm = context.Output("XNorm"); + auto* y_norm = context.Output("YNorm"); z->mutable_data(context.GetPlace()); + x_norm->mutable_data(context.GetPlace()); + y_norm->mutable_data(context.GetPlace()); auto dims = x->dims(); int size = static_cast(framework::product(dims)); auto new_dims = framework::make_ddim({dims[0], size / dims[0]}); auto X = EigenMatrix::From(*x, new_dims); auto Y = EigenMatrix::From(*y, new_dims); - auto Z = EigenMatrix::From(*z, new_dims); + auto Z = EigenMatrix::From(*z); + auto XNorm = EigenMatrix::From(*x_norm); + auto YNorm = EigenMatrix::From(*y_norm); - auto XY = (X * Y).sum(Eigen::array({1})); - auto XX = (X * X).sum(Eigen::array({1})); - auto YY = (Y * Y).sum(Eigen::array({1})); auto place = context.GetEigenDevice(); - Z.device(place) = XY / XX.sqrt() / YY.sqrt(); + auto XY = (X * Y).sum(Eigen::array({1})); + XNorm.device(place) = (X * X).sum(Eigen::array({1})).sqrt(); + YNorm.device(place) = (Y * Y).sum(Eigen::array({1})).sqrt(); + Z.device(place) = XY / XNorm / YNorm; } }; @@ -56,6 +62,8 @@ class CosSimGradKernel : public framework::OpKernel { auto* x = context.Input("X"); auto* y = context.Input("Y"); auto* z = context.Input("Out"); + auto* x_norm = context.Input("XNorm"); + auto* y_norm = context.Input("YNorm"); 
auto* grad_x = context.Output(framework::GradVarName("X")); auto* grad_y = context.Output(framework::GradVarName("Y")); auto* grad_z = context.Input(framework::GradVarName("Out")); @@ -69,23 +77,23 @@ class CosSimGradKernel : public framework::OpKernel { auto X = EigenMatrix::From(*x, new_dims); auto Y = EigenMatrix::From(*y, new_dims); auto Z = EigenMatrix::From(*z); + auto X_norm = EigenMatrix::From(*x_norm); + auto Y_norm = EigenMatrix::From(*y_norm); auto dX = EigenMatrix::From(*grad_x, new_dims); auto dY = EigenMatrix::From(*grad_y, new_dims); auto dZ = EigenMatrix::From(*grad_z); - auto XX = (X * X).sum(Eigen::array({1})); - auto YY = (Y * Y).sum(Eigen::array({1})); - Eigen::DSizes bcast(1, dims[1]); - auto denominator_bcast = (XX.sqrt() * YY.sqrt()).broadcast(bcast); + Eigen::DSizes bcast(1, new_dims[1]); auto Z_bcast = Z.broadcast(bcast); auto dZ_bcast = dZ.broadcast(bcast); auto place = context.GetEigenDevice(); + auto X_snorm_bcast = X_norm.square().eval().broadcast(bcast); + auto Y_snorm_bcast = Y_norm.square().eval().broadcast(bcast); + auto norm_prod_bcast = (X_norm * Y_norm).eval().broadcast(bcast); dX.device(place) = - dZ_bcast * (Y / denominator_bcast - Z_bcast * X / XX.broadcast(bcast)); + dZ_bcast * (Y / norm_prod_bcast - Z_bcast * X / X_snorm_bcast); dY.device(place) = - dZ_bcast * (X / denominator_bcast - Z_bcast * Y / YY.broadcast(bcast)); - // dX.device(place) = X; - // Y.device(place) = Y; + dZ_bcast * (X / norm_prod_bcast - Z_bcast * Y / Y_snorm_bcast); } }; diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py index bf01ea4876..409b3caf33 100644 --- a/python/paddle/v2/framework/tests/gradient_checker.py +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -304,13 +304,6 @@ class GradientChecker(unittest.TestCase): # get analytical gradients according to different device analytic_grads = self.__get_gradient(forward_op, backward_op, input_vars, check_names, place) - #print(numeric_grads[0], numeric_grads[0].shape) - print("dim0: ", numeric_grads[0], numeric_grads[0].shape) - print("dim0: ", analytic_grads[0], analytic_grads[0].shape) - print("---------------------") - print("dim1: ", numeric_grads[1], numeric_grads[1].shape) - print("dim1: ", analytic_grads[1], analytic_grads[1].shape) - assert False self.__assert_is_close(numeric_grads, analytic_grads, check_names, max_relative_error, "Gradient Check On %s" % str(place)) diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/framework/tests/test_cos_sim_op.py index f3b04d25f2..a19be47f76 100644 --- a/python/paddle/v2/framework/tests/test_cos_sim_op.py +++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py @@ -10,30 +10,40 @@ class TestCosSimOp(unittest.TestCase): def setUp(self): self.type = "cos_sim" self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((32, 84)).astype("float32") + 'X': np.random.random((32, 64)).astype("float32"), + 'Y': np.random.random((32, 64)).astype("float32") + } + expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1) + expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1) + expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=1) / \ + expect_x_norm / expect_y_norm + self.outputs = { + 'XNorm': np.expand_dims(expect_x_norm, 1), + 'YNorm': np.expand_dims(expect_y_norm, 1), + 'Out': np.expand_dims(expect_out, 1) } - expect = (self.inputs['X'] * self.inputs['Y']).sum(axis=1) / \ - np.linalg.norm(self.inputs['X'], axis=1) / \ - 
np.linalg.norm(self.inputs['Y'], axis=1) - expect = np.expand_dims(expect, 1) - self.outputs = {'Out': expect} class CosSimGradOpTest(GradientChecker): - def test_cos_sim(self): + def test_cos_sim_2d(self): + op = create_op("cos_sim") + inputs = { + 'X': np.random.random((10, 5)).astype("float32"), + 'Y': np.random.random((10, 5)).astype("float32") + } + self.compare_grad(op, inputs) + self.check_grad( + op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.05) + + def test_cos_sim_3d(self): op = create_op("cos_sim") - #inputs = { - #'X': np.random.random((2, 2)).astype("float32"), - #'Y': np.random.random((2, 2)).astype("float32") - #} inputs = { - 'X': np.array([[0.9, 0.6], [1.9, 1.6]]).astype("float32"), - 'Y': np.array([[0.7, 0.8], [1.7, 1.8]]).astype("float32") + 'X': np.random.random((10, 5, 2)).astype("float32"), + 'Y': np.random.random((10, 5, 2)).astype("float32") } - print(inputs) + self.compare_grad(op, inputs) self.check_grad( - op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5) + op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.05) if __name__ == '__main__': From a60128aeb235c3e53b6785537bf3f67502ca079d Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 4 Sep 2017 14:11:43 +0800 Subject: [PATCH 04/16] move identity_op from scale_op.cc to be a single file --- paddle/operators/CMakeLists.txt | 2 ++ paddle/operators/identity_op.cc | 54 +++++++++++++++++++++++++++++++++ paddle/operators/scale_op.cc | 31 +------------------ 3 files changed, 57 insertions(+), 30 deletions(-) create mode 100644 paddle/operators/identity_op.cc diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index e5efcccb0e..d85c3d575a 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -48,12 +48,14 @@ add_subdirectory(math) list(REMOVE_ITEM GENERAL_OPS net_op + identity_op minus_op mul_op recurrent_op scale_op) op_library(net_op SRCS net_op.cc) +op_library(identity_op SRCS identity_op.cc DEPS scale_op) op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op) op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc diff --git a/paddle/operators/identity_op.cc b/paddle/operators/identity_op.cc new file mode 100644 index 0000000000..be956bf3b3 --- /dev/null +++ b/paddle/operators/identity_op.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/net_op.h" +#include "paddle/operators/scale_op.h" + +namespace paddle { +namespace operators { + +// identity is a alias of scale op. This is also a example for creating a alias +// operator. 
+template +class IdentityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + IdentityOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "input tensor of identity op"); + AddOutput("Out", "output tensor of identity op"); + AddComment("identity operator. Just a alias of scale op which scale = 1.0"); + } +}; + +template +class IdentityOp : public NetOp { + public: + IdentityOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : NetOp(type, inputs, outputs, attrs) { + AppendOp(framework::OpRegistry::CreateOp( + "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}}, + {{"scale", static_cast(1)}})); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp, + ops::IdentityOpMaker); diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index 4e039688d4..57696d2ee6 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -48,7 +48,7 @@ The equation is: Out = scale*X } }; -// Identity Op's gradient is identity op, too. +// Scale Op's gradient is scale op, too. // Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out)) template class ScaleGradOp : public NetOp { @@ -65,33 +65,6 @@ class ScaleGradOp : public NetOp { } }; -// identity is a alias of scale op. This is also a example for creating a alias -// operator. -template -class IdentityOpMaker : public framework::OpProtoAndCheckerMaker { - public: - IdentityOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "input tensor of identity op"); - AddOutput("Out", "output tensor of identity op"); - AddComment("identity operator. 
Just a alias of scale op which scale = 1.0"); - } -}; - -template -class IdentityOp : public NetOp { - public: - IdentityOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : NetOp(type, inputs, outputs, attrs) { - AppendOp(framework::OpRegistry::CreateOp( - "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}}, - {{"scale", static_cast(1)}})); - } -}; - } // namespace operators } // namespace paddle @@ -101,5 +74,3 @@ REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradOp); REGISTER_OP_CPU_KERNEL(scale, ops::ScaleKernel); -REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp, - ops::IdentityOpMaker); From 740c8ba12a29418eefeb22a843bebb1781f300fe Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 4 Sep 2017 16:40:23 +0800 Subject: [PATCH 05/16] remove scatter_op.cu/gather_op.cu as they support only_cpu now --- paddle/operators/CMakeLists.txt | 24 ++++++++++++++++-------- paddle/operators/gather_op.cu | 20 -------------------- paddle/operators/scatter_op.cu | 20 -------------------- 3 files changed, 16 insertions(+), 48 deletions(-) delete mode 100644 paddle/operators/gather_op.cu delete mode 100644 paddle/operators/scatter_op.cu diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index d85c3d575a..90185101c4 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -46,15 +46,20 @@ endfunction() add_subdirectory(math) -list(REMOVE_ITEM GENERAL_OPS - net_op - identity_op - minus_op - mul_op - recurrent_op - scale_op) +set(ONLYCPU_OPS + net_op + gather_op + scatter_op) +foreach(src ${ONLYCPU_OPS}) + op_library(${src} SRCS ${src}.cc) +endforeach() -op_library(net_op SRCS net_op.cc) +set(DEPS_OPS + identity_op + minus_op + mul_op + recurrent_op + scale_op) op_library(identity_op SRCS identity_op.cc DEPS scale_op) op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op) op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) @@ -62,6 +67,9 @@ op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc DEPS framework_proto tensor operator net_op) op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op) +list(REMOVE_ITEM GENERAL_OPS + ${ONLYCPU_OPS} + ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) op_library(${src} SRCS ${src}.cc ${src}.cu) endforeach() diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu deleted file mode 100644 index 3f04a7b3f8..0000000000 --- a/paddle/operators/gather_op.cu +++ /dev/null @@ -1,20 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#define EIGEN_USE_GPU -#include "paddle/operators/gather_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gather, - ops::GatherOpKernel); diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu deleted file mode 100644 index 6716b47883..0000000000 --- a/paddle/operators/scatter_op.cu +++ /dev/null @@ -1,20 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#define EIGEN_USE_GPU -#include "paddle/operators/scatter_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(scatter, - ops::ScatterOpKernel); From efd40b66cc4bb898fe152337aeda9f65ced33767 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Tue, 5 Sep 2017 14:17:13 +0800 Subject: [PATCH 06/16] update the doc for how to write the operators. --- doc/howto/dev/new_op_cn.md | 141 +++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 77 deletions(-) diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index 3e71a0a592..6bd54137ba 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -23,19 +23,20 @@ - `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。 - `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成 -依据是否包含kernel,将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorBase`,后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: - - - 内容 | 定义位置 --------------- | :---------------------- -OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake -Op定义 | `.cc`文件 -Kernel实现 | CPU、GPU共享Kernel在`.h`文件,否则,CPU可以在`.cc`文件,GPU可在`.cu`文件。 -注册Op | Op注册在`.cc`文件;Kernel注册CPU在`.cc`文件,GPU在`.cu`文件 - - -下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。 +依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorBase`,后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: + + + 内容 | 定义位置 +-------------- | :---------------------- +OpProtoMake定义 | `*_op.cc`文件,Backward Op不需要定义OpProtoMake +Op定义 | `*_op.cc`文件 +Kernel实现 | CPU、GPU共享Kernel在`*_op.h`文件,否则,CPU可以在`*_op.cc`文件,GPU可在`*_op.cu`文件。 +注册Op | Op注册在`*_op.cc`文件;Kernel注册CPU在`*_op.cc`文件,GPU在`*_op.cu`文件 +实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。 + + +下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。 ## 实现C++类 @@ -43,8 +44,8 @@ Kernel实现 | CPU、GPU共享Kernel在`.h`文件,否则,CPU可以在` ### 1. 
定义ProtoMaker类 矩阵乘的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。首先定义`ProtoMaker`来描述该Op的输入、输出及注释: - -``` + +```cpp class MulOpMaker : public framework::OpProtoAndCheckerMaker { public: MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) @@ -59,20 +60,20 @@ The equation is: Out = X * Y } }; ``` - -[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数包括2个: + +[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数包括2个参数: - `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。 - `framework::OpAttrChecker` :后者用于检查参数属性的合法性。 - + 构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加该Op的注释,这些函数会将对应内容添加到`OpProto`中。 -在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,该命名尽可能的规范。 +在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守命名规范。 + - 再举个[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)的例子: - -``` + +```cpp template class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { public: @@ -87,17 +88,17 @@ The equation is: Out = scale*X } }; ``` - - 在这个例子里,两处不同: - + + 这个例子有两处不同: + - `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中。 - `AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 - + ### 2. 定义Operator类 -```c++ +```cpp class MulOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -121,20 +122,20 @@ class MulOp : public framework::OperatorWithKernel { ``` [`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员: - -```c++ + +```cpp using framework::OperatorWithKernel::OperatorWithKernel; ``` 这句表示使用基类`OperatorWithKernel`的构造函数,也可写成: - -```c++ + +```cpp MulOp(const std::string &type, const framework::VariableNameMap &inputs, const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} -``` - +``` + 还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是: - 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。 @@ -144,7 +145,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, ### 3. 
定义OpKernel类 -```C++ +```cpp template class MulKernel : public framework::OpKernel { public: @@ -163,34 +164,34 @@ class MulKernel : public framework::OpKernel { `MulKernel`继承自`framework::OpKernel`,带有模板参数: - `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 - + - `typename T` : 表示数据类型,如`float`, `double`等。 - + `MulKernel`需要重写`Compute`接口,该接口参数为`const framework::ExecutionContext& context`, `ExecutionContext`相比`InferShapeContext`增加了设备类型,同样可获取到输入输出和属性参数,`Compute`函数里写具体实现时。 - -注意,不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。`MulOp`的CPU、GPU实现共享同一个`Kernel`,`OpKernel`不共享的例子可以参考[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 - + +注意,不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。`MulOp`的CPU、GPU实现共享同一个`Kernel`,`OpKernel`不共享的例子可以参考[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 + 到此前向Op实现完成,需要在`.cc`文件中注册该op和kernel。反向Op类的定义和Kernel定义与前向Op类似,这里不再重复。但注意,反向Op没有`ProtoMaker`。 - + ### 4. 注册Operator 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 -```c++ +```cpp namespace ops = paddle::operators; REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, ops::MulGradKernel); ``` - + - `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`, - `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op。 - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulKernel`类。 在 `.cu`文件中注册GPU Kernel。 - -```c++ + +```cpp namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_GPU_KERNEL(mul_grad, @@ -199,56 +200,42 @@ REGISTER_OP_GPU_KERNEL(mul_grad, ### 5. 
编译 -在[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)文件中添加编译。 - -``` -op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) -``` - -下面命令可以编译: - +无需修改[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)文件,`paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动加入编译。 + +直接执行下面命令可进行编译: + ``` make mul_op ``` ## 绑定Python -- 绑定Python - - 在 [`paddle/pybind/pybind.cc +- 绑定Python + + 在 [`paddle/pybind/pybind.cc `](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc)文件中添加该类: ``` USE_OP(mul); ``` 如果只实现了CPU版本,则使用`USE_CPU_ONLY_OP`: - + ``` USE_CPU_ONLY_OP(gather); ``` - + 如果OP不带Kernel,则使用`USE_NO_KENREL_OP`: - + ``` USE_NO_KENREL_OP(recurrent); ``` - + 使用`USE_OP`告知编译器需要链接该Op的目标文件,具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。 - - + + - 生成库 - 在 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件添加类到`DEPS`中,使得该Op可以链接到生成的lib库中。 - - ``` - if(WITH_PYTHON) - cc_library(paddle_pybind SHARED - SRCS pybind.cc - DEPS pybind python backward - mul_op - minus_op) - endif(WITH_PYTHON) - ``` + 无需修改 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件,`paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动被添加链接至生成的lib库中。 ## 实现单元测试 @@ -258,7 +245,7 @@ make mul_op 前向Op单测继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`,具体单测流程在`OpTestMeta`里完成。需在`setUp`函数定义输入输出和属性参数,以及Python对比的输出值。 -``` +```python import unittest import numpy as np from gradient_checker import GradientChecker, create_op @@ -276,17 +263,17 @@ class TestMulOp(unittest.TestCase): self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} ``` 首先需要`import`必要的包,下面详细解释其他值: - + - `self.type = "mul" ` : 定义类型,和注册的类型一致。 - `self.inputs` : 定义输入,类型为Numpy.array,并初始化。 - `self.outputs` : 定义输出,并得到Python结算结果。 - + ### 反向Operator单测 反向Op单测继承自`GradientChecker`,而`GradientChecker`集成自`unittest.TestCase`,所以反向单测函数需要`test_`开头。 - ``` + ```python class MulGradOpTest(GradientChecker): def test_mul(self): op = create_op("mul") @@ -294,7 +281,7 @@ class TestMulOp(unittest.TestCase): 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } - self.compare_grad(op, inputs) + self.compare_grad(op, inputs) # mul op will enlarge the relative error self.check_grad( op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5) @@ -310,7 +297,7 @@ class TestMulOp(unittest.TestCase): - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out` -### 编译和执行 +### 编译和执行 单测完成之后,在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)里添加编译: From b3ff125d5598da6e362f360220eef816cf0d7f42 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Tue, 5 Sep 2017 16:29:57 +0800 Subject: [PATCH 07/16] follow comments. 
--- doc/howto/dev/new_op_cn.md | 42 +++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index 6bd54137ba..85c222163c 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -28,16 +28,18 @@ 内容 | 定义位置 -------------- | :---------------------- -OpProtoMake定义 | `*_op.cc`文件,Backward Op不需要定义OpProtoMake -Op定义 | `*_op.cc`文件 -Kernel实现 | CPU、GPU共享Kernel在`*_op.h`文件,否则,CPU可以在`*_op.cc`文件,GPU可在`*_op.cu`文件。 -注册Op | Op注册在`*_op.cc`文件;Kernel注册CPU在`*_op.cc`文件,GPU在`*_op.cu`文件 +OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake +Op定义 | `.cc`文件 +Kernel实现 | CPU、GPU共享Kernel在`.h`文件,否则,CPU可以在`.cc`文件,GPU可在`.cu`文件。 +注册Op | Op注册在`.cc`文件;Kernel注册CPU在`.cc`文件,GPU在`.cu`文件 + 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。 下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。 + ## 实现C++类 @@ -200,13 +202,18 @@ REGISTER_OP_GPU_KERNEL(mul_grad, ### 5. 编译 -无需修改[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)文件,`paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动加入编译。 +- 简单**无特殊依赖**的OP无需修改CMakeList.txt文件。[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt) 会自动将 `paddle/operators` 目录下新增的 `*_op.cc` 文件加入编译。 +- 较为复杂、**有额外依赖** 的operator仍需要修改[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)。如,`mul_op` 依赖 `math_function`,需要在`CMakeLists.txt`中添加如下内容: -直接执行下面命令可进行编译: + ``` + op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) + + ``` -``` -make mul_op -``` +- 运行下面命令可以进行编译: + + ``` + make mul_op + ``` ## 绑定Python @@ -235,13 +242,13 @@ make mul_op - 生成库 - 无需修改 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件,`paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动被添加链接至生成的lib库中。 + 无需修改 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件,`paddle/operators` 目录下新增的 `*_op.cc` 文件会自动被添加链接到生成的lib库中。 ## 实现单元测试 单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单测](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 -### 前向Operator单测 +### 前向Operator单元测试 前向Op单测继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`,具体单测流程在`OpTestMeta`里完成。需在`setUp`函数定义输入输出和属性参数,以及Python对比的输出值。 @@ -269,7 +276,7 @@ class TestMulOp(unittest.TestCase): - `self.outputs` : 定义输出,并得到Python结算结果。 -### 反向Operator单测 +### 反向Operator单元测试 反向Op单测继承自`GradientChecker`,而`GradientChecker`集成自`unittest.TestCase`,所以反向单测函数需要`test_`开头。 @@ -297,21 +304,22 @@ class TestMulOp(unittest.TestCase): - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out` -### 编译和执行 +### 编译和执行单元测试 -单测完成之后,在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)里添加编译: +单测完成之后,在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)里添加以下内容将单测加入工程中: ``` py_test(test_mul_op SRCS test_mul_op.py) ``` -编译时需要打开`WITH_TESTING`, 即 `cmake paddle_dir -DWITH_TESTING=ON`,编译成功之后执行单测命令为: +请注意,**不同于Op的编译测试,运行单元测试测时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir 
-DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单测: -``` +```bash make test ARGS="-R test_mul_op -V" ``` + 或者: -``` +```bash ctest -R test_mul_op ``` From 020e45f715b1a1cea5dcbee10dacb055b3889523 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 5 Sep 2017 16:54:12 +0800 Subject: [PATCH 08/16] follow comments to revert add_two_op to add_op --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/grad_op_builder_test.cc | 4 ++-- paddle/operators/{add_two_op.cc => add_op.cc} | 7 +++---- paddle/operators/{add_two_op.cu => add_op.cu} | 5 ++--- paddle/operators/{add_two_op.h => add_op.h} | 0 paddle/pybind/pybind.cc | 2 +- python/paddle/v2/framework/tests/test_add_two_op.py | 2 +- python/paddle/v2/framework/tests/test_gradient_checker.py | 2 +- python/paddle/v2/framework/tests/test_net.py | 4 ++-- python/paddle/v2/framework/tests/test_operator.py | 4 ++-- python/paddle/v2/framework/tests/test_recurrent_op.py | 2 +- 11 files changed, 16 insertions(+), 18 deletions(-) rename paddle/operators/{add_two_op.cc => add_op.cc} (88%) rename paddle/operators/{add_two_op.cu => add_op.cu} (82%) rename paddle/operators/{add_two_op.h => add_op.h} (100%) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index e138517b6b..c0838d9b75 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -25,7 +25,7 @@ cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator) cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) -cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_two_op) +cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op) py_proto_compile(framework_py_proto SRCS framework.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc index 902c2655e9..9e3ca563c6 100644 --- a/paddle/framework/grad_op_builder_test.cc +++ b/paddle/framework/grad_op_builder_test.cc @@ -3,7 +3,7 @@ #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" -USE_OP(add_two); +USE_OP(add); namespace paddle { namespace framework { @@ -41,7 +41,7 @@ namespace f = paddle::framework; TEST(GradOpBuilder, AddTwo) { std::shared_ptr add_op(f::OpRegistry::CreateOp( - "add_two", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {})); + "add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {})); std::shared_ptr grad_add_op = f::OpRegistry::CreateGradOp(*add_op); EXPECT_EQ(grad_add_op->Inputs().size(), 4UL); diff --git a/paddle/operators/add_two_op.cc b/paddle/operators/add_op.cc similarity index 88% rename from paddle/operators/add_two_op.cc rename to paddle/operators/add_op.cc index bc99e306e0..8dbd47cf0d 100644 --- a/paddle/operators/add_two_op.cc +++ b/paddle/operators/add_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/operators/add_two_op.h" +#include "paddle/operators/add_op.h" namespace paddle { namespace operators { @@ -57,7 +57,6 @@ class AddOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker, add_two_grad, ops::AddOpGrad); +REGISTER_OP(add, ops::AddOp, ops::AddOpMaker, add_grad, ops::AddOpGrad); -REGISTER_OP_CPU_KERNEL(add_two, - ops::AddKernel); +REGISTER_OP_CPU_KERNEL(add, ops::AddKernel); diff --git a/paddle/operators/add_two_op.cu b/paddle/operators/add_op.cu similarity index 82% rename from paddle/operators/add_two_op.cu rename to paddle/operators/add_op.cu index acc03b2c8b..fdf2c66c7c 100644 --- a/paddle/operators/add_two_op.cu +++ b/paddle/operators/add_op.cu @@ -14,8 +14,7 @@ #define EIGEN_USE_GPU #include "paddle/framework/op_registry.h" -#include "paddle/operators/add_two_op.h" +#include "paddle/operators/add_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(add_two, - ops::AddKernel); +REGISTER_OP_GPU_KERNEL(add, ops::AddKernel); diff --git a/paddle/operators/add_two_op.h b/paddle/operators/add_op.h similarity index 100% rename from paddle/operators/add_two_op.h rename to paddle/operators/add_op.h diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 6896422617..dba3d8f285 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -30,7 +30,7 @@ limitations under the License. */ namespace py = pybind11; -USE_OP(add_two); +USE_OP(add); USE_OP(onehot_cross_entropy); USE_OP(sgd); USE_OP(mul); diff --git a/python/paddle/v2/framework/tests/test_add_two_op.py b/python/paddle/v2/framework/tests/test_add_two_op.py index 0def484edd..a578e74eca 100644 --- a/python/paddle/v2/framework/tests/test_add_two_op.py +++ b/python/paddle/v2/framework/tests/test_add_two_op.py @@ -11,7 +11,7 @@ class TestAddOp(unittest.TestCase): __metaclass__ = OpTestMeta def setUp(self): - self.type = "add_two" + self.type = "add" self.inputs = { 'X': numpy.random.random((102, 105)).astype("float32"), 'Y': numpy.random.random((102, 105)).astype("float32") diff --git a/python/paddle/v2/framework/tests/test_gradient_checker.py b/python/paddle/v2/framework/tests/test_gradient_checker.py index e0b3151208..857427cdfb 100644 --- a/python/paddle/v2/framework/tests/test_gradient_checker.py +++ b/python/paddle/v2/framework/tests/test_gradient_checker.py @@ -7,7 +7,7 @@ from gradient_checker import get_numeric_gradient class GetNumericGradientTest(unittest.TestCase): def test_add_op(self): - add_op = Operator('add_two', X="X", Y="Y", Out="Z") + add_op = Operator('add', X="X", Y="Y", Out="Z") x = numpy.random.random((10, 1)).astype("float32") y = numpy.random.random((10, 1)).astype("float32") diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py index 9339cf28da..e4b7cd480c 100644 --- a/python/paddle/v2/framework/tests/test_net.py +++ b/python/paddle/v2/framework/tests/test_net.py @@ -15,7 +15,7 @@ def fc(X, W, Y): class TestNet(unittest.TestCase): def test_net_all(self): net = core.Net.create() - op1 = Operator("add_two", X="X", Y="Y", Out="Out") + op1 = Operator("add", X="X", Y="Y", Out="Out") net.append_op(op1) net2 = core.Net.create() @@ -26,7 +26,7 @@ class TestNet(unittest.TestCase): expected = ''' Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}. - Op(add_two), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}. + Op(add), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}. 
Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}. diff --git a/python/paddle/v2/framework/tests/test_operator.py b/python/paddle/v2/framework/tests/test_operator.py index 1abc4eeb57..040556322d 100644 --- a/python/paddle/v2/framework/tests/test_operator.py +++ b/python/paddle/v2/framework/tests/test_operator.py @@ -193,10 +193,10 @@ class TestOpDescCreationMethod(unittest.TestCase): class TestOpCreations(unittest.TestCase): def test_all(self): - add_op = op.Operator("add_two", X="a", Y="b", Out="z") + add_op = op.Operator("add", X="a", Y="b", Out="z") self.assertIsNotNone(add_op) # Invoke C++ DebugString() - self.assertEqual('Op(add_two), inputs:{X[a], Y[b]}, outputs:{Out[z]}.', + self.assertEqual('Op(add), inputs:{X[a], Y[b]}, outputs:{Out[z]}.', str(add_op)) diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py index d6000ab9f9..22e680fd78 100644 --- a/python/paddle/v2/framework/tests/test_recurrent_op.py +++ b/python/paddle/v2/framework/tests/test_recurrent_op.py @@ -146,7 +146,7 @@ class TestRecurrentOp(unittest.TestCase): stepnet = core.Net.create() x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx") h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") - sum_op = Operator("add_two", X="Wx", Y="Uh", Out="sum") + sum_op = Operator("add", X="Wx", Y="Uh", Out="sum") sig_op = Operator("sigmoid", X="sum", Y="h@alias") for op in [x_fc_op, h_fc_op, sum_op, sig_op]: From 2beaf3aa2a966980177ea0a0d309fd7a7a594a1c Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 5 Sep 2017 18:12:42 +0800 Subject: [PATCH 09/16] auto find .cc or .cu in operator/CMakeLists.txt --- paddle/operators/CMakeLists.txt | 54 +++++++++++++++------------------ 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 90185101c4..99b8e1967f 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -14,27 +14,31 @@ function(op_library TARGET) cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - foreach(src ${op_library_SRCS}) - if (${src} MATCHES ".*\\.cu$") - list(APPEND cu_srcs ${src}) - elseif(${src} MATCHES ".*\\.cc$") - list(APPEND cc_srcs ${src}) - else() - message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + list(LENGTH op_library_SRCS op_library_SRCS_len) + if (${op_library_SRCS_len} EQUAL 0) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND cc_srcs ${TARGET}.cc) endif() - endforeach() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${TARGET}.cc) + endif() + else() + foreach(src ${op_library_SRCS}) + if (${src} MATCHES ".*\\.cu$") + list(APPEND cu_srcs ${src}) + elseif(${src} MATCHES ".*\\.cc$") + list(APPEND cc_srcs ${src}) + else() + message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + endif() + endforeach() + endif() list(LENGTH cc_srcs cc_srcs_len) if (${cc_srcs_len} EQUAL 0) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() - list(LENGTH cu_srcs cu_srcs_len) - list(LENGTH op_library_DEPS dep_len) - if (${cu_srcs_len} EQUAL 0 AND ${dep_len} EQUAL 0) - message(WARNING "The op library ${TARGET} not support GPU!") - endif() - if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS 
${op_library_DEPS} ${op_common_deps}) @@ -46,32 +50,22 @@ endfunction() add_subdirectory(math) -set(ONLYCPU_OPS - net_op - gather_op - scatter_op) -foreach(src ${ONLYCPU_OPS}) - op_library(${src} SRCS ${src}.cc) -endforeach() - set(DEPS_OPS identity_op minus_op mul_op recurrent_op scale_op) -op_library(identity_op SRCS identity_op.cc DEPS scale_op) -op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op) -op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) +op_library(identity_op DEPS scale_op) +op_library(minus_op DEPS scale_op) +op_library(mul_op DEPS math_function) op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc DEPS framework_proto tensor operator net_op) -op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op) +op_library(scale_op DEPS net_op) -list(REMOVE_ITEM GENERAL_OPS - ${ONLYCPU_OPS} - ${DEPS_OPS}) +list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) - op_library(${src} SRCS ${src}.cc ${src}.cu) + op_library(${src}) endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") From 42513496f30c137e42b54fab6ce275845349bccf Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 5 Sep 2017 22:17:24 +0800 Subject: [PATCH 10/16] fix a typo in operator/CMakeLists.txt --- paddle/operators/CMakeLists.txt | 2 +- paddle/operators/add_op.cu | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 99b8e1967f..8a0ff1eb53 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -20,7 +20,7 @@ function(op_library TARGET) list(APPEND cc_srcs ${TARGET}.cc) endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) - list(APPEND cu_srcs ${TARGET}.cc) + list(APPEND cu_srcs ${TARGET}.cu) endif() else() foreach(src ${op_library_SRCS}) diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index fdf2c66c7c..d9c6d20a6c 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_op.cu @@ -12,8 +12,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU -#include "paddle/framework/op_registry.h" #include "paddle/operators/add_op.h" namespace ops = paddle::operators; From dc520da763f258e6ca1c79f79bb82bc55ec4cf14 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Tue, 5 Sep 2017 23:08:21 +0800 Subject: [PATCH 11/16] update doc of softmax_op. --- paddle/operators/softmax_op.cc | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 40c51a64c4..7d062ad67c 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -24,7 +24,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { PADDLE_ENFORCE(ctx.Input("X")->dims().size() == 2UL, - "The input of softmax op must be matrix"); + "The input of softmax op must be a matrix."); ctx.Output("Y")->Resize(ctx.Input("X")->dims()); } }; @@ -34,9 +34,27 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { SoftmaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "input of softmax"); - AddOutput("Y", "output of softmax"); - AddComment("Softmax Op"); + AddInput("X", + "The input tensor of softmax. 
" + "2-D with shape [batch_size, input_feature_dimensions]."); + AddOutput("Y", "The normalized values with the same shape as X."); + AddComment(R"DOC( +The input of softmax operator is a 2-D tensor with shape N x K (N is the +batch_size, K is the dimension of input feature). The output tensor has the +same shape as the input tensor. + +For each row of the input tensor, the softmax operator squashes the +K-dimensional vector of arbitrary real values to a K-dimensional vector of real +values in the range [0, 1] that add up to 1. Specifically, it computes the +exponential of the given dimension and the sum of exponential values of all +the other dimensions in the K-dimensional vector input. Then the ratio of the +exponential of the given dimension and the sum of exponential values of all +the other dimensions is the output of the softmax operator. + +For each row `i` and each column `j` in X, we have: + Y[i, j] = exp(X[i, j]) / sum_j(exp(X[i, j])) + +)DOC"); } }; From 11163dfc7a6c1504c42acf77bd48d677d7167941 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 6 Sep 2017 11:54:33 +0800 Subject: [PATCH 12/16] make dim int to int64_t --- paddle/framework/ddim.cc | 64 ++++++++++----------- paddle/framework/ddim.h | 20 +++---- paddle/framework/ddim_test.cc | 4 +- paddle/framework/dim.h | 67 +++++++++++----------- paddle/framework/dim_test.cu | 4 +- paddle/framework/eigen.h | 2 +- paddle/framework/tensor_impl.h | 4 +- paddle/operators/gaussian_random_op.cc | 10 +++- paddle/operators/rnn/recurrent_op_utils.cc | 2 +- paddle/operators/uniform_random_op.cc | 10 +++- paddle/pybind/pybind.cc | 2 +- paddle/pybind/tensor_py.h | 4 +- 12 files changed, 102 insertions(+), 91 deletions(-) diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index cfd3e8dfde..85b7de7974 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -21,16 +21,16 @@ namespace framework { /// @cond HIDDEN template -Dim make_dim(const int* d) { +Dim make_dim(const int64_t* d) { return Dim(*d, make_dim(d + 1)); } template <> -Dim<1> make_dim<1>(const int* d) { +Dim<1> make_dim<1>(const int64_t* d) { return Dim<1>(*d); } -void make_ddim(DDim& ddim, const int* dims, int n) { +void make_ddim(DDim& ddim, const int64_t* dims, int n) { switch (n) { case 1: ddim = make_dim<1>(dims); @@ -67,13 +67,13 @@ void make_ddim(DDim& ddim, const int* dims, int n) { /// @endcond -DDim make_ddim(std::initializer_list dims) { +DDim make_ddim(std::initializer_list dims) { DDim result(make_dim(0)); make_ddim(result, dims.begin(), dims.size()); return result; } -DDim make_ddim(const std::vector& dims) { +DDim make_ddim(const std::vector& dims) { DDim result(make_dim(0)); make_ddim(result, &dims[0], dims.size()); return result; @@ -81,12 +81,12 @@ DDim make_ddim(const std::vector& dims) { /// @cond HIDDEN // XXX For some reason, putting this in an anonymous namespace causes errors -class DynamicMutableIndexer : public boost::static_visitor { +class DynamicMutableIndexer : public boost::static_visitor { public: explicit DynamicMutableIndexer(int idx) : idx_(idx) {} template - int& operator()(Dim& dim) const { + int64_t& operator()(Dim& dim) const { return dim[idx_]; } @@ -94,12 +94,12 @@ class DynamicMutableIndexer : public boost::static_visitor { int idx_; }; -class DynamicConstIndexer : public boost::static_visitor { +class DynamicConstIndexer : public boost::static_visitor { public: explicit DynamicConstIndexer(int idx) : idx_(idx) {} template - int operator()(const Dim& dim) const { + int64_t operator()(const Dim& dim) const { return 
dim[idx_]; } @@ -109,22 +109,22 @@ class DynamicConstIndexer : public boost::static_visitor { /// @endcond -int& DDim::operator[](int idx) { +int64_t& DDim::operator[](int idx) { return boost::apply_visitor(DynamicMutableIndexer(idx), var); } -int DDim::operator[](int idx) const { +int64_t DDim::operator[](int idx) const { return boost::apply_visitor(DynamicConstIndexer(idx), var); } -ssize_t DDim::size() const { return arity(*this); } +int64_t DDim::size() const { return arity(*this); } bool DDim::operator==(DDim d) const { if (var.which() != d.getVar().which()) { return false; } else { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); for (unsigned int i = 0; i < v1.size(); i++) { if (v1[i] != v2[i]) { @@ -139,10 +139,10 @@ bool DDim::operator==(DDim d) const { bool DDim::operator!=(DDim d) const { return !(*this == d); } DDim DDim::operator+(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); - std::vector v3; + std::vector v3; assert(v1.size() == v2.size()); @@ -154,10 +154,10 @@ DDim DDim::operator+(DDim d) const { } DDim DDim::operator*(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); - std::vector v3; + std::vector v3; assert(v1.size() == v2.size()); @@ -168,15 +168,15 @@ DDim DDim::operator*(DDim d) const { return make_ddim(v3); } -int get(const DDim& ddim, int idx) { return ddim[idx]; } +int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } /// @cond HIDDEN struct VectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; + std::vector& vector; - explicit VectorizeVisitor(std::vector& v) : vector(v) {} + explicit VectorizeVisitor(std::vector& v) : vector(v) {} template void operator()(const T& t) { @@ -188,31 +188,31 @@ struct VectorizeVisitor : public boost::static_visitor<> { }; /// @endcond -std::vector vectorize(const DDim& ddim) { - std::vector result; +std::vector vectorize(const DDim& ddim) { + std::vector result; VectorizeVisitor visitor(result); boost::apply_visitor(visitor, ddim); return result; } -struct ProductVisitor : public boost::static_visitor { +struct ProductVisitor : public boost::static_visitor { template - ssize_t operator()(const Dim& dim) { + int64_t operator()(const Dim& dim) { return product(dim); } }; -ssize_t product(const DDim& ddim) { +int64_t product(const DDim& ddim) { ProductVisitor visitor; return boost::apply_visitor(visitor, ddim); } struct SliceVectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; + std::vector& vector; int begin; int end; - SliceVectorizeVisitor(std::vector& v, int b, int e) + SliceVectorizeVisitor(std::vector& v, int b, int e) : vector(v), begin(b), end(e) { PADDLE_ENFORCE(begin < end, "Begin index must be less than end index in ddim slice."); @@ -240,7 +240,7 @@ struct SliceVectorizeVisitor : public boost::static_visitor<> { }; DDim slice_ddim(const DDim& dim, int begin, int end) { - std::vector vec; + std::vector vec; vec.reserve(end - begin); SliceVectorizeVisitor visitor(vec, begin, end); boost::apply_visitor(visitor, dim); @@ -280,7 +280,7 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { return os; } -DDim::DDim(std::initializer_list init_list) { +DDim::DDim(std::initializer_list init_list) { *this = 
make_ddim(init_list); } } // namespace framework diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 95f294b627..db30c52394 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -40,7 +40,7 @@ struct DDim { template explicit DDim(const Dim& in) : var(in) {} - /*implicit*/ DDim(std::initializer_list init_list); + /*implicit*/ DDim(std::initializer_list init_list); template DDim& operator=(const Dim& in) { @@ -48,8 +48,8 @@ struct DDim { return *this; } - int& operator[](int idx); - int operator[](int idx) const; + int64_t& operator[](int idx); + int64_t operator[](int idx) const; template typename Visitor::result_type apply_visitor(Visitor& visitor) { @@ -71,15 +71,15 @@ struct DDim { DDim operator*(DDim d) const; - ssize_t size() const; + int64_t size() const; }; /** - * \brief Make a DDim from std::vector + * \brief Make a DDim from std::vector * * \param dims An vector of ints. Must be sized between [1, 9] */ -DDim make_ddim(const std::vector& dims); +DDim make_ddim(const std::vector& dims); /** * \brief Make a DDim from an initializer list @@ -87,14 +87,14 @@ DDim make_ddim(const std::vector& dims); * \param dims An initializer list of ints. Must be sized between [1, 9] * */ -DDim make_ddim(std::initializer_list dims); +DDim make_ddim(std::initializer_list dims); -int get(const DDim& dim, int idx); +int64_t get(const DDim& dim, int idx); void set(DDim& dim, int idx, int val); -std::vector vectorize(const DDim& ddim); +std::vector vectorize(const DDim& ddim); -ssize_t product(const DDim& ddim); +int64_t product(const DDim& ddim); /** * \brief Slice a ddim diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc index 9d18a2972c..756232b1b5 100644 --- a/paddle/framework/ddim_test.cc +++ b/paddle/framework/ddim_test.cc @@ -12,7 +12,7 @@ TEST(DDim, Equality) { EXPECT_EQ(ddim[2], 5); // construct a DDim from a vector - std::vector vec({9, 1, 5}); + std::vector vec({9, 1, 5}); paddle::framework::DDim vddim = paddle::framework::make_ddim(vec); EXPECT_EQ(ddim[0], 9); EXPECT_EQ(ddim[1], 1); @@ -25,7 +25,7 @@ TEST(DDim, Equality) { EXPECT_EQ(paddle::framework::get(ddim, 0), 6); // vectorize a DDim - std::vector res_vec = paddle::framework::vectorize(vddim); + std::vector res_vec = paddle::framework::vectorize(vddim); EXPECT_EQ(res_vec[0], 9); EXPECT_EQ(res_vec[1], 1); EXPECT_EQ(res_vec[2], 5); diff --git a/paddle/framework/dim.h b/paddle/framework/dim.h index 883fdc55eb..04d4b0e604 100644 --- a/paddle/framework/dim.h +++ b/paddle/framework/dim.h @@ -17,13 +17,13 @@ struct Dim { static constexpr int dimensions = i; template - HOSTDEVICE Dim(int _head, Args... _tail) : head(_head), tail(_tail...) { + HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { static_assert(sizeof...(_tail) == i - 1, "Dim initialized with the wrong number of parameters"); } HOSTDEVICE - Dim(int _head, const Dim& _tail) : head(_head), tail(_tail) {} + Dim(int64_t _head, const Dim& _tail) : head(_head), tail(_tail) {} HOSTDEVICE Dim() : head(0), tail() {} @@ -31,12 +31,12 @@ struct Dim { /** Construct a Dim from a linear index and size. Uses Fortran order * indexing. 
*/ HOSTDEVICE - Dim(int idx, const Dim& size) + Dim(int64_t idx, const Dim& size) : head(idx % size.head), tail(idx / size.head, size.tail) {} /** Construct a Dim with each dimension set to the given index */ HOSTDEVICE - Dim(int idx) : head(idx), tail(idx) {} + Dim(int64_t idx) : head(idx), tail(idx) {} HOSTDEVICE bool operator==(const Dim& o) const { @@ -47,13 +47,13 @@ struct Dim { bool operator!=(const Dim& o) const { return !(*this == o); } HOSTDEVICE - int& operator[](int idx); + int64_t& operator[](int idx); HOSTDEVICE - int operator[](int idx) const; + int64_t operator[](int idx) const; HOST std::string to_string() const; - int head; + int64_t head; Dim tail; }; @@ -63,7 +63,7 @@ struct Dim<1> { static constexpr int dimensions = 1; HOSTDEVICE - Dim(int _head) : head(_head) {} + Dim(int64_t _head) : head(_head) {} HOSTDEVICE Dim() : head(0) {} @@ -86,11 +86,11 @@ struct Dim<1> { bool operator!=(const Dim<1>& o) const { return !(*this == o); } HOSTDEVICE - int& operator[](int idx); + int64_t& operator[](int idx); HOSTDEVICE - int operator[](int idx) const; + int64_t operator[](int idx) const; - int head; + int64_t head; }; namespace { @@ -100,12 +100,12 @@ template struct DimGetter { // Return a copy if Dim is const template - HOSTDEVICE static int impl(const D& d) { + HOSTDEVICE static int64_t impl(const D& d) { return DimGetter::impl(d.tail); } // Return a reference if Dim is mutable template - HOSTDEVICE static int& impl(D& d) { + HOSTDEVICE static int64_t& impl(D& d) { return DimGetter::impl(d.tail); } }; @@ -115,18 +115,18 @@ template <> struct DimGetter<0> { // Return a copy if Dim is const template - HOSTDEVICE static int impl(const D& d) { + HOSTDEVICE static int64_t impl(const D& d) { return d.head; } // Return a reference if Dim is mutable template - HOSTDEVICE static int& impl(D& d) { + HOSTDEVICE static int64_t& impl(D& d) { return d.head; } }; template -HOSTDEVICE int& indexer(Dim& dim, int idx) { +HOSTDEVICE int64_t& indexer(Dim& dim, int idx) { #ifndef __CUDA_ARCH__ if (idx < 0) { throw std::invalid_argument("Tried to access a negative dimension"); @@ -141,7 +141,7 @@ HOSTDEVICE int& indexer(Dim& dim, int idx) { } template <> -HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) { +HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) { #ifndef __CUDA_ARCH__ if (idx != 0) { throw std::invalid_argument("Invalid index"); @@ -153,7 +153,7 @@ HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) { } template -HOSTDEVICE int indexer(const Dim& dim, int idx) { +HOSTDEVICE int64_t indexer(const Dim& dim, int idx) { #ifndef __CUDA_ARCH__ if (idx < 0) { throw std::invalid_argument("Tried to access a negative dimension"); @@ -168,7 +168,7 @@ HOSTDEVICE int indexer(const Dim& dim, int idx) { } template <> -HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) { +HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) { #ifndef __CUDA_ARCH__ if (idx != 0) { throw std::invalid_argument("Invalid index"); @@ -182,73 +182,76 @@ HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) { } // namespace // Static access to constant Dim template -HOSTDEVICE int get(const Dim& d) { +HOSTDEVICE int64_t get(const Dim& d) { return DimGetter::impl(d); } // Static access to mutable Dim template -HOSTDEVICE int& get(Dim& d) { +HOSTDEVICE int64_t& get(Dim& d) { return DimGetter::impl(d); } // Dynamic access to constant Dim template -HOSTDEVICE int Dim::operator[](int i) const { +HOSTDEVICE int64_t Dim::operator[](int i) const { return indexer(*this, i); } // Dynamic access to mutable Dim template 
-HOSTDEVICE int& Dim::operator[](int i) { +HOSTDEVICE int64_t& Dim::operator[](int i) { return indexer(*this, i); } // Dynamic access to constant Dim -inline HOSTDEVICE int Dim<1>::operator[](int i) const { +inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const { return indexer(*this, i); } // Dynamic access to mutable Dim -inline HOSTDEVICE int& Dim<1>::operator[](int i) { return indexer(*this, i); } +inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) { + return indexer(*this, i); +} // Dynamic access to constant Dim // without std::enable_if will try to instantiate this on get<0>(d) template -HOSTDEVICE typename std::enable_if<(l > 0), int>::type get(const Dim& d, - int i) { +HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim& d, + int i) { return d[i]; } // Dynamic access to mutable Dim template -HOSTDEVICE typename std::enable_if<(l > 0), int&>::type get(Dim& d, int i) { +HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim& d, + int i) { return d[i]; } // Dot product of two dims template -HOSTDEVICE int linearize(const Dim& a, const Dim& b) { +HOSTDEVICE int64_t linearize(const Dim& a, const Dim& b) { return a.head * b.head + linearize(a.tail, b.tail); } // Base case dot product of two Dims // Notice it is inline because it is no longer a template template <> -HOSTDEVICE inline int linearize(const Dim<1>& a, const Dim<1>& b) { +HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) { return a.head * b.head; } // Product of a Dim template -HOSTDEVICE int product(const Dim& a, int prod = 1) { +HOSTDEVICE int64_t product(const Dim& a, int prod = 1) { return prod * a.head * product(a.tail); } // Base case product of a Dim // Notice it is inline because it is no longer a template template <> -HOSTDEVICE inline int product(const Dim<1>& a, int prod) { +HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) { return prod * a.head; } diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu index 3898d0a447..f42b1eeadf 100644 --- a/paddle/framework/dim_test.cu +++ b/paddle/framework/dim_test.cu @@ -47,9 +47,9 @@ TEST(Dim, Equality) { EXPECT_EQ(b[1], 11); // dynamic access on GPU - thrust::device_vector r(1); + thrust::device_vector r(1); dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); - int res = r[0]; + int64_t res = r[0]; EXPECT_EQ(res, 6); // ex_prefix_mul diff --git a/paddle/framework/eigen.h b/paddle/framework/eigen.h index a4667cc51f..2d8d9ae10c 100644 --- a/paddle/framework/eigen.h +++ b/paddle/framework/eigen.h @@ -28,7 +28,7 @@ struct EigenDim { static Type From(const DDim& dims) { PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)"); Type ret; - for (int d = 0; d < arity(dims); d++) { + for (int64_t d = 0; d < arity(dims); d++) { ret[d] = dims[d]; } return ret; diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 7893e233b7..94f436294f 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -58,7 +58,7 @@ inline T* Tensor::mutable_data(platform::Place place) { "Tensor's numel must be larger than zero to call " "Tensor::mutable_data. 
Call Tensor::set_dim first."); /* some versions of boost::variant don't have operator!= */ - size_t size = product(dims_) * sizeof(T); + int64_t size = product(dims_) * sizeof(T); if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + offset_) { if (platform::is_cpu_place(place)) { @@ -131,7 +131,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { PADDLE_ENFORCE_LT(begin_idx, end_idx, "Begin index must be less than end index."); PADDLE_ENFORCE_NE(dims_[0], 1, "Can not slice a tensor with dims_[0] = 1."); - int base = product(dims_) / dims_[0]; + size_t base = product(dims_) / dims_[0]; Tensor dst; dst.holder_ = holder_; DDim dst_dims = dims_; diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index 056447901d..583c9e5774 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -31,8 +31,8 @@ class CPUGaussianRandomKernel : public framework::OpKernel { } engine.seed(seed); std::normal_distribution dist(mean, std); - ssize_t size = framework::product(tensor->dims()); - for (ssize_t i = 0; i < size; ++i) { + int64_t size = framework::product(tensor->dims()); + for (int64_t i = 0; i < size; ++i) { data[i] = dist(engine); } } @@ -46,9 +46,13 @@ class GaussianRandomOp : public framework::OperatorWithKernel { void InferShape(const framework::InferShapeContext& context) const override { auto* tensor = context.Output("Out"); auto dims = GetAttr>("dims"); + std::vector temp(dims.size()); + for (auto dim : dims) { + temp.push_back(static_cast(dim)); + } PADDLE_ENFORCE(dims.size() > 0UL, "dims can be one int or array. dims must be set."); - tensor->Resize(framework::make_ddim(dims)); + tensor->Resize(framework::make_ddim(temp)); } }; diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc index a9b65c30f2..69e723b401 100644 --- a/paddle/operators/rnn/recurrent_op_utils.cc +++ b/paddle/operators/rnn/recurrent_op_utils.cc @@ -61,7 +61,7 @@ void ConcatOutputs(const std::vector& step_scopes, PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope", outlinks[i].internal); f::DDim step_dims = step_scope_var->template GetMutable()->dims(); - std::vector dims_vec = vectorize(step_dims); + std::vector dims_vec = vectorize(step_dims); dims_vec.insert(dims_vec.begin(), seq_len); output->Resize(f::make_ddim(dims_vec)); } else { diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 2d943c4508..3241d4289a 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -35,8 +35,8 @@ class CPUUniformRandomKernel : public framework::OpKernel { std::uniform_real_distribution dist( static_cast(context.GetAttr("min")), static_cast(context.GetAttr("max"))); - ssize_t size = framework::product(tensor->dims()); - for (ssize_t i = 0; i < size; ++i) { + int64_t size = framework::product(tensor->dims()); + for (int64_t i = 0; i < size; ++i) { data[i] = dist(engine); } } @@ -52,7 +52,11 @@ class UniformRandomOp : public framework::OperatorWithKernel { "uniform_random's min must less then max"); auto* tensor = ctx.Output("Out"); auto dims = GetAttr>("dims"); - tensor->Resize(framework::make_ddim(dims)); + std::vector temp(dims.size()); + for (auto dim : dims) { + temp.push_back(static_cast(dim)); + } + tensor->Resize(framework::make_ddim(temp)); } }; diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 6896422617..7db629a68f 100644 --- 
a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -76,7 +76,7 @@ PYBIND11_PLUGIN(core) { .def("get_dims", [](const Tensor &self) { return vectorize(self.dims()); }) .def("set_dims", - [](Tensor &self, const std::vector &dim) { + [](Tensor &self, const std::vector &dim) { self.Resize(make_ddim(dim)); }) .def("alloc_float", diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 39ba60b4dc..95171acf72 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -85,7 +85,7 @@ void PyCPUTensorSetFromArray( framework::Tensor &self, py::array_t array, paddle::platform::CPUPlace &place) { - std::vector dims; + std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { dims.push_back((int)array.shape()[i]); @@ -102,7 +102,7 @@ void PyCUDATensorSetFromArray( framework::Tensor &self, py::array_t array, paddle::platform::GPUPlace &place) { - std::vector dims; + std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { dims.push_back((int)array.shape()[i]); From 857cd24a35bef9c182292ca709fe1dd9d0e52e19 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 6 Sep 2017 12:18:23 +0800 Subject: [PATCH 13/16] fix dim_test.cu bug --- paddle/framework/dim_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu index f42b1eeadf..0a6a87669c 100644 --- a/paddle/framework/dim_test.cu +++ b/paddle/framework/dim_test.cu @@ -8,7 +8,7 @@ __global__ void test(paddle::framework::Dim<2>* o) { o[0] = paddle::framework::make_dim(5, 6); } -__global__ void dyn_idx_gpu(int* o) { +__global__ void dyn_idx_gpu(int64_t* o) { auto d = paddle::framework::make_dim(5, 6); o[0] = d[1]; } From a5f1e6d6bdd094253cd6f47bc7bbefb1fbc3b1e4 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 6 Sep 2017 12:58:00 +0800 Subject: [PATCH 14/16] Update cos_sim operator by following reviewer's comments. 
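(A note for readers of this patch, not part of the commit itself: the updated backward kernel below implements the row-wise derivatives of the cosine similarity z = x . y / (||x|| * ||y||), namely

    dz/dx = y / (||x|| * ||y||) - z * x / ||x||^2
    dz/dy = x / (||x|| * ||y||) - z * y / ||y||^2

each broadcast across the row and scaled by the upstream gradient of Out. The new null checks simply skip either formula when the corresponding input variable does not need a gradient.)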
--- paddle/operators/cos_sim_op.cc | 4 +- paddle/operators/cos_sim_op.h | 97 ++++++++++--------- .../v2/framework/tests/test_cos_sim_op.py | 38 +++++--- 3 files changed, 76 insertions(+), 63 deletions(-) diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 3760d0b161..c033af3b74 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -90,8 +90,8 @@ class CosSimOpGrad : public framework::OperatorWithKernel { auto *x_grad = ctx.Output(framework::GradVarName("X")); auto *y_grad = ctx.Output(framework::GradVarName("Y")); - x_grad->Resize(x_dims); - y_grad->Resize(y_dims); + if (x_grad) x_grad->Resize(x_dims); + if (y_grad) y_grad->Resize(y_dims); } }; diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 69d35d8bc2..9e3ff26815 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -28,30 +28,30 @@ template class CosSimKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - auto* x_norm = context.Output("XNorm"); - auto* y_norm = context.Output("YNorm"); + auto* input_x = context.Input("X"); + auto* input_y = context.Input("Y"); + auto* output_z = context.Output("Out"); + auto* output_x_norm = context.Output("XNorm"); + auto* output_y_norm = context.Output("YNorm"); - z->mutable_data(context.GetPlace()); - x_norm->mutable_data(context.GetPlace()); - y_norm->mutable_data(context.GetPlace()); + output_z->mutable_data(context.GetPlace()); + output_x_norm->mutable_data(context.GetPlace()); + output_y_norm->mutable_data(context.GetPlace()); - auto dims = x->dims(); + auto dims = input_x->dims(); int size = static_cast(framework::product(dims)); auto new_dims = framework::make_ddim({dims[0], size / dims[0]}); - auto X = EigenMatrix::From(*x, new_dims); - auto Y = EigenMatrix::From(*y, new_dims); - auto Z = EigenMatrix::From(*z); - auto XNorm = EigenMatrix::From(*x_norm); - auto YNorm = EigenMatrix::From(*y_norm); + auto x = EigenMatrix::From(*input_x, new_dims); + auto y = EigenMatrix::From(*input_y, new_dims); + auto z = EigenMatrix::From(*output_z); + auto x_norm = EigenMatrix::From(*output_x_norm); + auto y_norm = EigenMatrix::From(*output_y_norm); auto place = context.GetEigenDevice(); - auto XY = (X * Y).sum(Eigen::array({1})); - XNorm.device(place) = (X * X).sum(Eigen::array({1})).sqrt(); - YNorm.device(place) = (Y * Y).sum(Eigen::array({1})).sqrt(); - Z.device(place) = XY / XNorm / YNorm; + auto xy = (x * y).sum(Eigen::array({1})); + x_norm.device(place) = x.square().sum(Eigen::array({1})).sqrt(); + y_norm.device(place) = y.square().sum(Eigen::array({1})).sqrt(); + z.device(place) = xy / x_norm / y_norm; } }; @@ -59,41 +59,44 @@ template class CosSimGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Input("Out"); - auto* x_norm = context.Input("XNorm"); - auto* y_norm = context.Input("YNorm"); - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Output(framework::GradVarName("Y")); - auto* grad_z = context.Input(framework::GradVarName("Out")); + auto* input_x = context.Input("X"); + auto* input_y = context.Input("Y"); + auto* input_z = context.Input("Out"); + auto* input_x_norm = context.Input("XNorm"); + auto* input_y_norm = 
context.Input("YNorm"); + auto* output_grad_x = context.Output(framework::GradVarName("X")); + auto* output_grad_y = context.Output(framework::GradVarName("Y")); + auto* input_grad_z = context.Input(framework::GradVarName("Out")); - grad_x->mutable_data(context.GetPlace()); - grad_y->mutable_data(context.GetPlace()); - - auto dims = x->dims(); + auto dims = input_x->dims(); int size = static_cast(framework::product(dims)); auto new_dims = framework::make_ddim({dims[0], size / dims[0]}); - auto X = EigenMatrix::From(*x, new_dims); - auto Y = EigenMatrix::From(*y, new_dims); - auto Z = EigenMatrix::From(*z); - auto X_norm = EigenMatrix::From(*x_norm); - auto Y_norm = EigenMatrix::From(*y_norm); - auto dX = EigenMatrix::From(*grad_x, new_dims); - auto dY = EigenMatrix::From(*grad_y, new_dims); - auto dZ = EigenMatrix::From(*grad_z); + auto x = EigenMatrix::From(*input_x, new_dims); + auto y = EigenMatrix::From(*input_y, new_dims); + auto z = EigenMatrix::From(*input_z); + auto x_norm = EigenMatrix::From(*input_x_norm); + auto y_norm = EigenMatrix::From(*input_y_norm); + auto dz = EigenMatrix::From(*input_grad_z); Eigen::DSizes bcast(1, new_dims[1]); - auto Z_bcast = Z.broadcast(bcast); - auto dZ_bcast = dZ.broadcast(bcast); + auto z_bcast = z.broadcast(bcast); + auto dz_bcast = dz.broadcast(bcast); auto place = context.GetEigenDevice(); - auto X_snorm_bcast = X_norm.square().eval().broadcast(bcast); - auto Y_snorm_bcast = Y_norm.square().eval().broadcast(bcast); - auto norm_prod_bcast = (X_norm * Y_norm).eval().broadcast(bcast); - dX.device(place) = - dZ_bcast * (Y / norm_prod_bcast - Z_bcast * X / X_snorm_bcast); - dY.device(place) = - dZ_bcast * (X / norm_prod_bcast - Z_bcast * Y / Y_snorm_bcast); + auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast); + auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast); + auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast); + if (output_grad_x) { + output_grad_x->mutable_data(context.GetPlace()); + auto dx = EigenMatrix::From(*output_grad_x, new_dims); + dx.device(place) = + dz_bcast * (y / norm_prod_bcast - z_bcast * x / x_snorm_bcast); + } + if (output_grad_y) { + output_grad_y->mutable_data(context.GetPlace()); + auto dy = EigenMatrix::From(*output_grad_y, new_dims); + dy.device(place) = + dz_bcast * (x / norm_prod_bcast - z_bcast * y / y_snorm_bcast); + } } }; diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/framework/tests/test_cos_sim_op.py index a19be47f76..32013a7999 100644 --- a/python/paddle/v2/framework/tests/test_cos_sim_op.py +++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py @@ -24,26 +24,36 @@ class TestCosSimOp(unittest.TestCase): } -class CosSimGradOpTest(GradientChecker): - def test_cos_sim_2d(self): - op = create_op("cos_sim") - inputs = { +class TestCosSimGradOp(GradientChecker): + def setUp(self): + self.op = create_op("cos_sim") + self.inputs = { 'X': np.random.random((10, 5)).astype("float32"), 'Y': np.random.random((10, 5)).astype("float32") } - self.compare_grad(op, inputs) + + def test_cpu_gpu_compare(self): + self.compare_grad(self.op, self.inputs) + + def test_normal(self): self.check_grad( - op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.05) + self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.05) - def test_cos_sim_3d(self): - op = create_op("cos_sim") - inputs = { - 'X': np.random.random((10, 5, 2)).astype("float32"), - 'Y': np.random.random((10, 5, 2)).astype("float32") - } - self.compare_grad(op, inputs) + def test_ignore_x(self): 
+        self.check_grad( + self.op, + self.inputs, ["Y"], + "Out", + max_relative_error=0.05, + no_grad_set={"X"}) + + def test_ignore_y(self): self.check_grad( + self.op, + self.inputs, ["X"], + "Out", + max_relative_error=0.05, + no_grad_set={"Y"}) if __name__ == '__main__': From 52f2bc1f9d84a05f1c73428124a1235fb141f193 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 6 Sep 2017 13:34:01 +0800 Subject: [PATCH 15/16] fix vector size error --- paddle/operators/gaussian_random_op.cc | 3 ++- paddle/operators/uniform_random_op.cc | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index 583c9e5774..8bb61275ba 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -46,7 +46,8 @@ class GaussianRandomOp : public framework::OperatorWithKernel { void InferShape(const framework::InferShapeContext& context) const override { auto* tensor = context.Output("Out"); auto dims = GetAttr>("dims"); - std::vector temp(dims.size()); + std::vector temp; + temp.reserve(dims.size()); for (auto dim : dims) { temp.push_back(static_cast(dim)); } diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 3241d4289a..40cef8942a 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -52,7 +52,8 @@ class UniformRandomOp : public framework::OperatorWithKernel { "uniform_random's min must be less than max"); auto* tensor = ctx.Output("Out"); auto dims = GetAttr>("dims"); - std::vector temp(dims.size()); + std::vector temp; + temp.reserve(dims.size()); for (auto dim : dims) { temp.push_back(static_cast(dim)); } From 1e419d9ed3610c95b518982a026666ea3f699f27 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 6 Sep 2017 17:24:47 +0800 Subject: [PATCH 16/16] fix clang build and run error --- paddle/operators/cos_sim_op.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 9e3ff26815..9e2bcebe3b 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -23,6 +23,9 @@ using Tensor = framework::Tensor; template using EigenMatrix = framework::EigenMatrix; +template +using EigenVector = framework::EigenVector; template class CosSimKernel : public framework::OpKernel { @@ -43,14 +46,14 @@ class CosSimKernel : public framework::OpKernel { auto new_dims = framework::make_ddim({dims[0], size / dims[0]}); auto x = EigenMatrix::From(*input_x, new_dims); auto y = EigenMatrix::From(*input_y, new_dims); - auto z = EigenMatrix::From(*output_z); - auto x_norm = EigenMatrix::From(*output_x_norm); - auto y_norm = EigenMatrix::From(*output_y_norm); + auto z = EigenVector::Flatten(*output_z); + auto x_norm = EigenVector::Flatten(*output_x_norm); + auto y_norm = EigenVector::Flatten(*output_y_norm); auto place = context.GetEigenDevice(); - auto xy = (x * y).sum(Eigen::array({1})); - x_norm.device(place) = x.square().sum(Eigen::array({1})).sqrt(); - y_norm.device(place) = y.square().sum(Eigen::array({1})).sqrt(); + auto xy = (x * y).sum(Eigen::array({{1}})); + x_norm.device(place) = x.square().sum(Eigen::array({{1}})).sqrt(); + y_norm.device(place) = y.square().sum(Eigen::array({{1}})).sqrt(); z.device(place) = xy / x_norm / y_norm; } };
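As a standalone sanity check of the forward computation above, the row-wise cosine similarity can be sketched with plain Eigen, outside the Paddle framework. This is an editorial sketch, not part of the patch series; the matrix sizes are arbitrary assumptions.

#include <Eigen/Dense>
#include <iostream>

// Row-wise cosine similarity: out[i] = <x_i, y_i> / (|x_i| * |y_i|),
// mirroring what CosSimKernel computes for a [batch, feature] input pair.
int main() {
  Eigen::MatrixXf x = Eigen::MatrixXf::Random(10, 5);
  Eigen::MatrixXf y = Eigen::MatrixXf::Random(10, 5);
  Eigen::VectorXf xy = x.cwiseProduct(y).rowwise().sum();  // per-row dot product
  Eigen::VectorXf x_norm = x.rowwise().norm();             // per-row L2 norm
  Eigen::VectorXf y_norm = y.rowwise().norm();
  Eigen::VectorXf out = xy.cwiseQuotient(x_norm.cwiseProduct(y_norm));
  std::cout << out.transpose() << "\n";  // one value in [-1, 1] per row
  return 0;
}

Each entry of out should match the corresponding row of the Out tensor produced by the cos_sim op on the same data, which is essentially what the Python gradient checker's forward pass verifies.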