From 264b644718c14da348114bb9a44afddcd7166f11 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Wed, 2 Aug 2017 21:26:29 +0800
Subject: [PATCH 01/17] "add rowwise add backward op"

---
 paddle/operators/rowwise_add_op.cc | 15 +++++++++++++++
 paddle/operators/rowwise_add_op.h  | 19 +++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index 2ad2b66c8f..cc763a8cf4 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -46,6 +46,17 @@ for i in xrange(X.shape[0]):
 )DOC");
   }
 };
+class RowWiseAddGradOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 4UL,
+                   "RowWiseAddGrad inputs is I, O, OG, size must be 4");
+    PADDLE_ENFORCE(ctx.OutputSize() == 2,
+                   "RowWiseAddGrad output is IG, size must be 2");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
+    ctx.Output<Tensor>(1)->Resize(ctx.Input<Tensor>(1)->dims());
+  }
+};
 
 }  // namespace operators
 }  // namespace paddle
@@ -53,3 +64,7 @@ for i in xrange(X.shape[0]):
 REGISTER_OP(rowwise_add, ops::RowWiseAddOp, ops::RowWiseAddOpMaker);
 REGISTER_OP_CPU_KERNEL(rowwise_add,
                        ops::RowWiseAddKernel<ops::CPUPlace, float>);
+
+REGISTER_GRADIENT_OP(rowwise_add, rowwise_add_grad, ops::RowWiseAddGradOp);
+REGISTER_OP_CPU_KERNEL(rowwise_add_grad,
+                       ops::RowWiseAddGradKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index b86dd54634..940459e0f1 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -38,5 +38,24 @@ public:
   }
 };
 
+template <typename Place, typename T>
+class RowWiseAddGradKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext& context) const override {
+    auto XGrad = context.Output<Tensor>(0);
+    auto bGrad = context.Output<Tensor>(1);
+    XGrad->mutable_data<T>(context.GetPlace());
+    bGrad->mutable_data<T>(context.GetPlace());
+
+    // I, O, OG  => [X, b], [Out], [OutGrad]
+    auto OutGrad = EigenMatrix<T>::From(*context.Input<Tensor>(3));
+    EigenMatrix<T>::From(*XGrad).device(*(context.GetEigenDevice<Place>())) =
+        OutGrad;
+    // const int dimension = bGrad.dimension(0);
+    // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
+    EigenVector<T>::Flatten(*bGrad).device(*(context.GetEigenDevice<Place>())) =
+        OutGrad.cumsum(1);  // colwise add
+  }
+};
 }  // namespace operators
 }  // namespace paddle

From 8ff3590eda2a6488f4b06f5ce6ffe553ae42d0a6 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Fri, 4 Aug 2017 01:15:56 +0800
Subject: [PATCH 02/17] fix op name

---
 paddle/operators/rowwise_add_op.cc | 20 ++++++++++----------
 paddle/operators/rowwise_add_op.cu |  2 +-
 paddle/operators/rowwise_add_op.h  |  6 +++---
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index cc763a8cf4..178ea3c614 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -16,7 +16,7 @@
 namespace paddle {
 namespace operators {
 
-class RowWiseAddOp : public OperatorWithKernel {
+class RowwiseAddOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 2UL,
@@ -32,9 +32,9 @@ protected:
   }
 };
 
-class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
+class RowwiseAddOpMaker : public OpProtoAndCheckerMaker {
 public:
-  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  RowwiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The left input of row-wise add op, must be matrix");
     AddInput("b", "The right input of row-wise add op, must be vector");
@@ -46,13 +46,13 @@ for i in xrange(X.shape[0]):
 )DOC");
   }
 };
-class RowWiseAddGradOp : public OperatorWithKernel {
+class RowwiseAddGradOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 4UL,
-                   "RowWiseAddGrad inputs is I, O, OG, size must be 4");
+                   "RowwiseAddGrad inputs is I, O, OG, size must be 4");
     PADDLE_ENFORCE(ctx.OutputSize() == 2,
-                   "RowWiseAddGrad output is IG, size must be 2");
+                   "RowwiseAddGrad output is IG, size must be 2");
     ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
     ctx.Output<Tensor>(1)->Resize(ctx.Input<Tensor>(1)->dims());
   }
@@ -61,10 +61,10 @@ protected:
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP(rowwise_add, ops::RowWiseAddOp, ops::RowWiseAddOpMaker);
+REGISTER_OP(rowwise_add, ops::RowwiseAddOp, ops::RowwiseAddOpMaker);
 REGISTER_OP_CPU_KERNEL(rowwise_add,
-                       ops::RowWiseAddKernel<ops::CPUPlace, float>);
+                       ops::RowwiseAddKernel<ops::CPUPlace, float>);
 
-REGISTER_GRADIENT_OP(rowwise_add, rowwise_add_grad, ops::RowWiseAddGradOp);
+REGISTER_GRADIENT_OP(rowwise_add, rowwise_add_grad, ops::RowwiseAddGradOp);
 REGISTER_OP_CPU_KERNEL(rowwise_add_grad,
-                       ops::RowWiseAddGradKernel<ops::CPUPlace, float>);
+                       ops::RowwiseAddGradKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu
index 4b33e38eba..f48dfeb6f2 100644
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -1,4 +1,4 @@
 #include "paddle/operators/rowwise_add_op.h"
 
 REGISTER_OP_GPU_KERNEL(rowwise_add,
-                       ops::RowWiseAddKernel<ops::GPUPlace, float>);
+                       ops::RowwiseAddKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index 940459e0f1..321f51e61d 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class RowWiseAddKernel : public OpKernel {
+class RowwiseAddKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext& context) const override {
     auto out = context.Output<Tensor>(0);
@@ -39,7 +39,7 @@ public:
 };
 
 template <typename Place, typename T>
-class RowWiseAddGradKernel : public OpKernel {
+class RowwiseAddGradKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext& context) const override {
     auto XGrad = context.Output<Tensor>(0);
@@ -51,7 +51,7 @@ public:
     auto OutGrad = EigenMatrix<T>::From(*context.Input<Tensor>(3));
     EigenMatrix<T>::From(*XGrad).device(*(context.GetEigenDevice<Place>())) =
         OutGrad;
-    // const int dimension = bGrad.dimension(0);
+
     // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
     EigenVector<T>::Flatten(*bGrad).device(*(context.GetEigenDevice<Place>())) =
         OutGrad.cumsum(1);  // colwise add

From b7ee1e7d9c7f01844b23c54a3c5a2584e0a6a410 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Wed, 9 Aug 2017 00:12:09 +0800
Subject: [PATCH 03/17] "backward check todo"

---
 paddle/operators/rowwise_add_op.h                       | 8 ++++----
 python/paddle/v2/framework/tests/test_rowwise_add_op.py | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index 06af88a993..965c0df532 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -42,18 +42,18 @@ template <typename Place, typename T>
 class RowwiseAddGradKernel : public OpKernel {
  public:
   void Compute(const ExecutionContext& context) const override {
-    auto XGrad = context.Output<Tensor>(0);
-    auto bGrad = context.Output<Tensor>(1);
+    auto* XGrad = context.Output<Tensor>(0);
+    auto* bGrad = context.Output<Tensor>(1);
     XGrad->mutable_data<T>(context.GetPlace());
     bGrad->mutable_data<T>(context.GetPlace());
 
     // I, O, OG  => [X, b], [Out], [OutGrad]
     auto OutGrad = EigenMatrix<T>::From(*context.Input<Tensor>(3));
-    EigenMatrix<T>::From(*XGrad).device(*(context.GetEigenDevice<Place>())) =
+    EigenMatrix<T>::From(*XGrad).device(context.GetEigenDevice<Place>()) =
         OutGrad;
 
     // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
-    EigenVector<T>::Flatten(*bGrad).device(*(context.GetEigenDevice<Place>())) =
+    EigenVector<T>::Flatten(*bGrad).device(context.GetEigenDevice<Place>()) =
         OutGrad.cumsum(1);  // colwise add
   }
 };
diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
index f8521eb517..e957dd6b3f 100644
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
@@ -15,5 +15,7 @@ class TestRowwiseAddOp(unittest.TestCase):
         self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}
 
 
+#TODO(dzh): rowwise_grad check
+
 if __name__ == '__main__':
     unittest.main()

From 200e3e2c6b7b3c1be47204f0e76ab79696f46efb Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Wed, 9 Aug 2017 10:14:20 +0800
Subject: [PATCH 04/17] "change namespace prefix"

---
 paddle/operators/rowwise_add_op.cc | 6 +++---
 paddle/operators/rowwise_add_op.h  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index c192da04da..a012ab0be0 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -35,7 +35,7 @@ class RowwiseAddOp : public framework::OperatorWithKernel {
 
 class RowwiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RowWiseAddOpMaker(framework::OpProto *proto,
+  RowwiseAddOpMaker(framework::OpProto *proto,
                     framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The left input of row-wise add op, must be matrix");
@@ -48,9 +48,9 @@ for i in xrange(X.shape[0]):
 )DOC");
   }
 };
-class RowwiseAddGradOp : public OperatorWithKernel {
+class RowwiseAddGradOp : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 4UL,
                    "RowwiseAddGrad inputs is I, O, OG, size must be 4");
     PADDLE_ENFORCE(ctx.OutputSize() == 2,
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index ad43e753e4..27d7a33e8a 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class RowwiseAddKernel : public OpKernel {
+class RowwiseAddKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto out = context.Output<Tensor>(0);

From 68bfc3ff963474e12c8af1c3575128b0acac90ed Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Wed, 9 Aug 2017 21:02:51 +0800
Subject: [PATCH 05/17] "add python test"

---
 .../v2/framework/tests/test_rowwise_add_op.py       | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
index e957dd6b3f..1b27f54f15 100644
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
@@ -1,6 +1,7 @@
 import unittest
-from op_test_util import OpTestMeta
 import numpy as np
+from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
 
 
 class TestRowwiseAddOp(unittest.TestCase):
@@ -15,6 +16,16 @@ class TestRowwiseAddOp(unittest.TestCase):
         self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}
 
 
+class RowwiseAddGradOpTest(GradientChecker):
+    def test_rowwise_add(self):
+        op = create_op("rowwise_add")
+        inputs = {
+            "X": np.random.uniform(0.1, 1, [10, 10]).astype("float32"),
+            "b": np.random.uniform(0.1, 1, [10, 1]).astype("float32")
+        }
+        self.check_grad(op, inputs, set("X", "b"), "Out")
+
+
 #TODO(dzh): rowwise_grad check
 
 if __name__ == '__main__':

From 7c0cb0c7901093e7b2aa57100f086f737ab39739 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Wed, 9 Aug 2017 23:51:46 +0800
Subject: [PATCH 06/17] "fix ci launch"

---
 python/paddle/v2/framework/tests/test_rowwise_add_op.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
index 1b27f54f15..8118d2d741 100644
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
@@ -23,7 +23,7 @@ class RowwiseAddGradOpTest(GradientChecker):
             "X": np.random.uniform(0.1, 1, [10, 10]).astype("float32"),
             "b": np.random.uniform(0.1, 1, [10, 1]).astype("float32")
         }
-        self.check_grad(op, inputs, set("X", "b"), "Out")
+        self.check_grad(op, inputs, set(["X", "b"]), "Out")
 
 
 #TODO(dzh): rowwise_grad check

From 12ee5014857e751fb429e0d3ebcfd41dcd5da29d Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Mon, 14 Aug 2017 20:57:46 +0800
Subject: [PATCH 07/17] "fix operator grad config"

---
 paddle/operators/rowwise_add_op.cc | 23 +++++++++++++++++------
 paddle/operators/rowwise_add_op.h  | 21 +++++++++++----------
 2 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index 0c6ae64d0c..60e5d7749c 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -17,6 +17,8 @@
 namespace paddle {
 namespace operators {
 
+using framework::Tensor;
+
 class RowwiseAddOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -50,14 +52,23 @@ for i in xrange(X.shape[0]):
   }
 };
 class RowwiseAddGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(ctx.InputSize() == 4UL,
-                   "RowwiseAddGrad inputs is I, O, OG, size must be 4");
-    PADDLE_ENFORCE(ctx.OutputSize() == 2,
-                   "RowwiseAddGrad output is IG, size must be 2");
-    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
-    ctx.Output<Tensor>(1)->Resize(ctx.Input<Tensor>(1)->dims());
+    // PADDLE_ENFORCE(ctx.InputSize() == 4UL,
+    //                "RowwiseAddGrad inputs is I, O, OG, size must be 4");
+    // PADDLE_ENFORCE(ctx.OutputSize() == 2,
+    //                "RowwiseAddGrad output is IG, size must be 2");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "X should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("b"), "b should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) should not be null");
+    auto dims0 = ctx.Input<Tensor>("X")->dims();
+    auto dims1 = ctx.Input<Tensor>("b")->dims();
+    ctx.Output<Tensor>(framework::GradVarName("X"))->Resize(dims0);
+    ctx.Output<Tensor>(framework::GradVarName("b"))->Resize(dims1);
   }
 };
 
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index 3ad60172c1..6593d811e4 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -51,19 +51,20 @@ template <typename Place, typename T>
 class RowwiseAddGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* XGrad = context.Output<Tensor>(0);
-    auto* bGrad = context.Output<Tensor>(1);
-    XGrad->mutable_data<T>(context.GetPlace());
-    bGrad->mutable_data<T>(context.GetPlace());
+    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* db = context.Output<Tensor>(framework::GradVarName("b"));
+    auto* dOut = context.Output<Tensor>(framework::GradVarName("Out"));
+    dX->mutable_data<T>(context.GetPlace());
+    db->mutable_data<T>(context.GetPlace());
 
-    // I, O, OG  => [X, b], [Out], [OutGrad]
-    auto OutGrad = EigenMatrix<T>::From(*context.Input<Tensor>(3));
-    EigenMatrix<T>::From(*XGrad).device(context.GetEigenDevice<Place>()) =
-        OutGrad;
+    auto OutGrad = EigenMatrix<T>::From(*dOut);
+    auto place = context.GetEigenDevice<Place>();
+    EigenMatrix<T>::From(*dX).device(place) = OutGrad;
 
     // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
-    EigenVector<T>::Flatten(*bGrad).device(context.GetEigenDevice<Place>()) =
-        OutGrad.cumsum(1);  // colwise add
+    // colwise add
+    Eigen::array<int, 1> dims{{1}}; /* dimension to reduce */
+    EigenVector<T>::Flatten(*db).device(place) = OutGrad.sum(dims);
   }
 };
 }  // namespace operators

From e28e007373fca4faae6301f10b7c58e36153aec7 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 17 Aug 2017 18:41:23 +0800
Subject: [PATCH 08/17] Enable test_sgd_op

---
 python/paddle/v2/framework/tests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 96fad9b42e..faeac69513 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -25,3 +25,4 @@ py_test(test_operator SRCS test_operator.py)
 # py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
 py_test(test_uniform_random_op SRCS test_uniform_random_op.py)
 py_test(test_recurrent_op SRCS test_recurrent_op.py)
+py_test(test_sgd_op SRCS test_sgd_op.py)

From a107181beae437705c561a245a102d7909d45d0d Mon Sep 17 00:00:00 2001
From: haonanyu <haonanyu@baidu.com>
Date: Thu, 17 Aug 2017 13:19:16 -0700
Subject: [PATCH 09/17] fix EXTERNAL_LIBS in CMakeLists.txt

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dcd1218a5b..06dd5a1332 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,9 +137,9 @@ set(EXTERNAL_LIBS
 )
 
 if(WITH_GPU)
-    list(APPEND EXTERNAL_LIB ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
     if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIB ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
     endif(NOT WITH_DSO)
 endif(WITH_GPU)
 

From c332e4ee25ca28f307c1d3ccbcec9458fd25f5b3 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Thu, 17 Aug 2017 16:12:27 -0700
Subject: [PATCH 10/17] "relauch the ci"

---
 paddle/operators/rowwise_add_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index 15192d90be..82e5df591d 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -73,7 +73,7 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OP(rowwise_add, ops::RowwiseAddOp, ops::RowwiseAddOpMaker,
-            rowwise_add_grad);
+            rowwise_add_grad, ops::RowwiseAddGradOp);
 REGISTER_OP_CPU_KERNEL(
     rowwise_add, ops::RowwiseAddKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(

From cef27dab47b430ce4034cfcfedf0c6bc95266f51 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Thu, 17 Aug 2017 19:14:27 -0700
Subject: [PATCH 11/17] "add fixl"

---
 paddle/operators/rowwise_add_op.cc                      | 1 +
 python/paddle/v2/framework/tests/test_rowwise_add_op.py | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index 82e5df591d..f07dd8f602 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -63,6 +63,7 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel {
                             "Input(Out@GRAD) should not be null");
     auto dims0 = ctx.Input<Tensor>("X")->dims();
     auto dims1 = ctx.Input<Tensor>("b")->dims();
+    PADDLE_ENFORCE_EQ(1, framework::product(dims1), "b dims should be 1")
     ctx.Output<Tensor>(framework::GradVarName("X"))->Resize(dims0);
     ctx.Output<Tensor>(framework::GradVarName("b"))->Resize(dims1);
   }
diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
index 8118d2d741..29d72e8500 100644
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
@@ -21,12 +21,10 @@ class RowwiseAddGradOpTest(GradientChecker):
         op = create_op("rowwise_add")
         inputs = {
             "X": np.random.uniform(0.1, 1, [10, 10]).astype("float32"),
-            "b": np.random.uniform(0.1, 1, [10, 1]).astype("float32")
+            "b": np.random.uniform(0.1, 1, [10]).astype("float32")
         }
         self.check_grad(op, inputs, set(["X", "b"]), "Out")
 
 
-#TODO(dzh): rowwise_grad check
-
 if __name__ == '__main__':
     unittest.main()

From 8b3d33a055b2a1556adedeb41a16b794249a3848 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Thu, 17 Aug 2017 20:10:44 -0700
Subject: [PATCH 12/17] fix-sgd

---
 paddle/operators/sgd_op.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index bfb449d0b0..a0b5000ffb 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -30,7 +30,7 @@ class SGDOpKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto param = ctx.Input<Tensor>("param");
     auto grad = ctx.Input<Tensor>("grad");
-    auto param_out = ctx.Output<Tensor>(0);
+    auto param_out = ctx.Output<Tensor>("param_out");
     float lr = ctx.op_.GetAttr<float>("learning_rate");
 
     param_out->mutable_data<T>(ctx.GetPlace());

From 55437b58b9b91d543f3498c3913a75bfb1122d6f Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 18 Aug 2017 14:36:17 +0800
Subject: [PATCH 13/17] Add ENVIRONMENT interface

---
 paddle/memory/memory.cc             | 10 +++++
 paddle/memory/memory.h              |  1 -
 paddle/platform/CMakeLists.txt      |  3 +-
 paddle/platform/environment.h       | 59 +++++++++++++++++++++++++++++
 paddle/platform/environment_test.cc | 54 ++++++++++++++++++++++++++
 paddle/platform/gpu_info.cc         | 10 +++++
 paddle/platform/gpu_info.h          |  4 ++
 7 files changed, 139 insertions(+), 2 deletions(-)
 create mode 100644 paddle/platform/environment.h
 create mode 100644 paddle/platform/environment_test.cc

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index c99cc54156..0f46e1b8ea 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -18,8 +18,13 @@ limitations under the License. */
 #include <cstring>    // for memcpy
 #include <mutex>      // for call_once
 
+#include "glog/logging.h"
+
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/gpu_info.h"
+
+DECLARE_double(fraction_of_gpu_memory_to_use);
 
 namespace paddle {
 namespace memory {
@@ -79,6 +84,11 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
                                                  platform::GpuMinChunkSize(),
                                                  platform::GpuMaxChunkSize()));
     }
+    VLOG(3) << "\n\nNOTE: each GPU device use "
+            << FLAGS_fraction_of_gpu_memory_to_use * 100 << "% of GPU memory.\n"
+            << "You can set environment variable '"
+            << platform::kEnvFractionGpuMemoryToUse
+            << "' to change the fraction of GPU usage.\n\n";
   });
 
   platform::SetDeviceId(gpu_id);
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 72351b9dfa..11bbb88187 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 
 namespace paddle {
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index acfc063973..120eb1e4af 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog)
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags)
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
@@ -9,6 +9,7 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 add_subdirectory(dynload)
 
 cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece)
+cc_test(environment_test SRCS environment_test.cc DEPS stringpiece)
 
 IF(WITH_GPU)
     set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
diff --git a/paddle/platform/environment.h b/paddle/platform/environment.h
new file mode 100644
index 0000000000..b868de4892
--- /dev/null
+++ b/paddle/platform/environment.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdlib.h>
+#include <vector>
+
+#include "paddle/platform/enforce.h"
+#include "paddle/string/piece.h"
+
+extern char** environ;
+
+namespace paddle {
+namespace platform {
+
+inline void SetEnvVariable(const std::string& name, const std::string& value) {
+  PADDLE_ENFORCE_NE(setenv(name.c_str(), value.c_str(), 1), -1,
+                    "Failed to set environment variable %s=%s", name, value);
+}
+
+inline void UnsetEnvVariable(const std::string& name) {
+  PADDLE_ENFORCE_NE(unsetenv(name.c_str()), -1,
+                    "Failed to unset environment variable %s", name);
+}
+
+inline bool IsEnvVarDefined(const std::string& name) {
+  return std::getenv(name.c_str()) != nullptr;
+}
+
+inline std::string GetEnvValue(const std::string& name) {
+  PADDLE_ENFORCE(IsEnvVarDefined(name),
+                 "Tried to access undefined environment variable %s", name);
+  return std::getenv(name.c_str());
+}
+
+inline std::vector<std::string> GetAllEnvVariables() {
+  std::vector<std::string> vars;
+  for (auto var = environ; *var != nullptr; ++var) {
+    auto tail = string::Index(*var, "=");
+    auto name = string::SubStr(*var, 0, tail).ToString();
+    vars.push_back(name);
+  }
+  return vars;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/environment_test.cc b/paddle/platform/environment_test.cc
new file mode 100644
index 0000000000..5f13652721
--- /dev/null
+++ b/paddle/platform/environment_test.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/environment.h"
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+TEST(ENVIRONMENT, ACCESS) {
+  namespace platform = paddle::platform;
+  namespace string = paddle::string;
+
+  platform::SetEnvVariable("PADDLE_USE_ENV", "TRUE");
+
+  EXPECT_TRUE(platform::IsEnvVarDefined("PADDLE_USE_ENV"));
+  EXPECT_EQ(platform::GetEnvValue("PADDLE_USE_ENV"), "TRUE");
+
+  platform::UnsetEnvVariable("PADDLE_USE_ENV");
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV"));
+
+  platform::SetEnvVariable("PADDLE_USE_ENV1", "Hello ");
+  platform::SetEnvVariable("PADDLE_USE_ENV2", "World, ");
+  platform::SetEnvVariable("PADDLE_USE_ENV3", "PaddlePaddle!");
+
+  std::string env_info;
+  auto vars = platform::GetAllEnvVariables();
+  for_each(vars.begin(), vars.end(), [&](const std::string& var) {
+    env_info += platform::GetEnvValue(var);
+  });
+
+  EXPECT_TRUE(string::Contains(env_info, "Hello World, PaddlePaddle!"));
+  platform::UnsetEnvVariable("PADDLE_USE_ENV1");
+  platform::UnsetEnvVariable("PADDLE_USE_ENV2");
+  platform::UnsetEnvVariable("PADDLE_USE_ENV3");
+
+  env_info.clear();
+  vars = platform::GetAllEnvVariables();
+  for_each(vars.begin(), vars.end(), [&](const std::string& var) {
+    env_info += platform::GetEnvValue(var);
+  });
+
+  EXPECT_FALSE(string::Contains(env_info, "Hello World, PaddlePaddle!"));
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV1"));
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV2"));
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV3"));
+}
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index edeb3ecd7b..be381a4e26 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/platform/gpu_info.h"
+
 #include "gflags/gflags.h"
+
 #include "paddle/platform/enforce.h"
+#include "paddle/platform/environment.h"
 
 DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
               "Default use 95% of GPU memory for PaddlePaddle,"
@@ -70,6 +73,13 @@ size_t GpuMaxChunkSize() {
 
   GpuMemoryUsage(available, total);
 
+  if (IsEnvVarDefined(kEnvFractionGpuMemoryToUse)) {
+    auto val = std::stod(GetEnvValue(kEnvFractionGpuMemoryToUse));
+    PADDLE_ENFORCE_GT(val, 0.0);
+    PADDLE_ENFORCE_LE(val, 1.0);
+    FLAGS_fraction_of_gpu_memory_to_use = val;
+  }
+
   // Reserving the rest memory for page tables, etc.
   size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;
 
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index d3a5f5f13f..6a99838241 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -18,10 +18,14 @@ limitations under the License. */
 
 #include <cuda_runtime.h>
 #include <stddef.h>
+#include <string>
 
 namespace paddle {
 namespace platform {
 
+//! Environment variable: fraction of GPU memory to use on each device.
+const std::string kEnvFractionGpuMemoryToUse = "FRACTION_GPU_MEMORY_TO_USE";
+
 //! Get the total number of GPU devices in system.
 int GetDeviceCount();
 

From 3ec9ecb162d67b25adb7563381c9e693e3c6f306 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 18 Aug 2017 15:36:57 +0800
Subject: [PATCH 14/17] Fix conflicts with new declaration with 'C' linkage

---
 paddle/platform/environment.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/platform/environment.h b/paddle/platform/environment.h
index b868de4892..9ed7653525 100644
--- a/paddle/platform/environment.h
+++ b/paddle/platform/environment.h
@@ -15,13 +15,12 @@ limitations under the License. */
 #pragma once
 
 #include <stdlib.h>
+#include <unistd.h>
 #include <vector>
 
 #include "paddle/platform/enforce.h"
 #include "paddle/string/piece.h"
 
-extern char** environ;
-
 namespace paddle {
 namespace platform {
 
@@ -46,6 +45,7 @@ inline std::string GetEnvValue(const std::string& name) {
 }
 
 inline std::vector<std::string> GetAllEnvVariables() {
+  extern char** environ;
   std::vector<std::string> vars;
   for (auto var = environ; *var != nullptr; ++var) {
     auto tail = string::Index(*var, "=");

From 83d0016f54a79faa6cc8626283fd96eb3f704183 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 18 Aug 2017 15:49:10 +0800
Subject: [PATCH 15/17] Fix undefined reference

---
 paddle/platform/environment.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/platform/environment.h b/paddle/platform/environment.h
index 9ed7653525..4edcce932e 100644
--- a/paddle/platform/environment.h
+++ b/paddle/platform/environment.h
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/platform/enforce.h"
 #include "paddle/string/piece.h"
 
+extern char** environ;  // for environment variables
+
 namespace paddle {
 namespace platform {
 
@@ -45,7 +47,6 @@ inline std::string GetEnvValue(const std::string& name) {
 }
 
 inline std::vector<std::string> GetAllEnvVariables() {
-  extern char** environ;
   std::vector<std::string> vars;
   for (auto var = environ; *var != nullptr; ++var) {
     auto tail = string::Index(*var, "=");

From b3ab15a7abed52a7b70d74fd7b9642b2ca0ca7b1 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 18 Aug 2017 17:39:10 +0800
Subject: [PATCH 16/17] follow comments

---
 paddle/platform/gpu_info.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index 6a99838241..ed2420b874 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -24,7 +24,8 @@ namespace paddle {
 namespace platform {
 
 //! Environment variable: fraction of GPU memory to use on each device.
-const std::string kEnvFractionGpuMemoryToUse = "FRACTION_GPU_MEMORY_TO_USE";
+const std::string kEnvFractionGpuMemoryToUse =
+    "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
 
 //! Get the total number of GPU devices in system.
 int GetDeviceCount();

From 82b820e97b90f21d7b46629bba72436a69e888e1 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Fri, 18 Aug 2017 08:21:56 -0700
Subject: [PATCH 17/17] fix rowwise_add_grad_op

---
 paddle/operators/rowwise_add_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index f07dd8f602..6825dce332 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -63,7 +63,7 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel {
                             "Input(Out@GRAD) should not be null");
     auto dims0 = ctx.Input<Tensor>("X")->dims();
     auto dims1 = ctx.Input<Tensor>("b")->dims();
-    PADDLE_ENFORCE_EQ(1, framework::product(dims1), "b dims should be 1")
+    PADDLE_ENFORCE_EQ(1, dims1.size(), "b dims should be 1")
     ctx.Output<Tensor>(framework::GradVarName("X"))->Resize(dims0);
     ctx.Output<Tensor>(framework::GradVarName("b"))->Resize(dims1);
   }