From 5ad9474bf7d2ad94578bd509957ae331cde36ab0 Mon Sep 17 00:00:00 2001
From: dongzhihong
Date: Mon, 24 Jul 2017 10:36:10 +0800
Subject: [PATCH 01/51] add random op

---
 paddle/operators/CMakeLists.txt |  1 +
 paddle/operators/random_op.cc   | 46 +++++++++++++++++++++++++++++++++
 paddle/operators/random_op.cu   |  6 +++++
 paddle/operators/random_op.h    | 29 +++++++++++++++++++++
 4 files changed, 82 insertions(+)
 create mode 100644 paddle/operators/random_op.cc
 create mode 100644 paddle/operators/random_op.cu
 create mode 100644 paddle/operators/random_op.h

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index a37720e509..14f8303c40 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -48,6 +48,7 @@ op_library(mul_op SRCS mul_op.cc mul_op.cu)
 op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
 op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc)
 op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
+op_library(random_op SRCS random_op.cc random_op.cu)
 op_library(fc_op SRCS fc_op.cc DEPS mul_op rowwise_add_op sigmoid_op
         softmax_op net)

diff --git a/paddle/operators/random_op.cc b/paddle/operators/random_op.cc
new file mode 100644
index 0000000000..c219a0b67d
--- /dev/null
+++ b/paddle/operators/random_op.cc
@@ -0,0 +1,46 @@
+#include "paddle/operators/random_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+class RandomOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(
+      const std::vector<const framework::Tensor*>& inputs,
+      const std::vector<framework::Tensor*>& outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 0, "Input size of RandomOp must be zero.");
+    PADDLE_ENFORCE(outputs.size() == 1, "Output size of RandomOp must be one.");
+    PADDLE_ENFORCE(inputs[0] != nullptr && outputs[0] != nullptr,
+                   "Inputs/Outputs of RandomOp must all be set.");
+    outputs[0]->set_dims(inputs[0]->dims());
+  }
+};
+
+class RandomOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  RandomOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<std::vector<int>>("Shape", "The shape of matrix to be randomized");
+    AddAttr<float>("seed", "random seed generator.").SetDefault(1337);
+    AddAttr<float>("mean", "mean value of random.").SetDefault(.0);
+    AddAttr<float>("std", "minimum value of random value")
+        .SetDefault(1.0)
+        .LargerThan(.0);
+    AddOutput("Out", "output matrix of random op");
+    AddComment(R"DOC(
+Random Operator fill a matrix in normal distribution.
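+For example, with Shape=[2, 3], mean=0.0 and std=1.0, every entry of the 2x3
+output Out is drawn independently from N(0, 1).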
+The eqution : Out = Random(Shape=(d0, d1, ...), Dtype, mean, std) +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP(random_op, + paddle::operators::RandomOp, + paddle::operators::RandomOpMaker); + +typedef paddle::operators::RandomOpKernel + RandomOpKernel_CPU_float; +REGISTER_OP_CPU_KERNEL(random_op, RandomOpKernel_CPU_float); diff --git a/paddle/operators/random_op.cu b/paddle/operators/random_op.cu new file mode 100644 index 0000000000..50985f6699 --- /dev/null +++ b/paddle/operators/random_op.cu @@ -0,0 +1,6 @@ +#include "paddle/operators/random_op.h" +#include "paddle/framework/op_registry.h" + +typedef paddle::operators::RandomOpKernel + RandomOpKernel_GPU_float; +REGISTER_OP_GPU_KERNEL(random_op, RandomOpKernel_GPU_float); \ No newline at end of file diff --git a/paddle/operators/random_op.h b/paddle/operators/random_op.h new file mode 100644 index 0000000000..123d9d6ffa --- /dev/null +++ b/paddle/operators/random_op.h @@ -0,0 +1,29 @@ +#pragma once +#include +#include "glog/logging.h" +#include "paddle/framework/eigen.h" +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { +template +class RandomOpKernel : public framework::OpKernel { +public: + void Compute(const framework::KernelContext& context) const override { + auto* output = context.Output(0)->GetMutable(); + output->mutable_data(context.GetPlace()); + + auto shape = context.op_.attrs_.at("Shape"); + auto mean = context.op_.attrs_.at("mean"); + auto std = context.op_.attrs_.at("std"); + auto seed = context.op_.attrs_.at("seed"); + // std::default_random_engine generator(seed); + // std::normal_distribution distribution(mean, std); + + framework::EigenMatrix::From(*output).device(*( + context.GetEigenDevice())) = framework::EigenMatrix::Random(); + } +}; + +} // namespace operators +} // namespace paddle From 0d554f1dea499e72ce0e0d6c240aac0add23cf49 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Mon, 24 Jul 2017 21:01:57 +0800 Subject: [PATCH 02/51] "add template fill function" --- paddle/operators/random_op.cc | 14 +++++++++++- paddle/operators/random_op.cu | 13 ++++++++++++ paddle/operators/random_op.h | 40 +++++++++++++++++++++++++++++------ 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/paddle/operators/random_op.cc b/paddle/operators/random_op.cc index c219a0b67d..b85ff84220 100644 --- a/paddle/operators/random_op.cc +++ b/paddle/operators/random_op.cc @@ -3,6 +3,18 @@ namespace paddle { namespace operators { + +using paddle::platform::GPUPlace; +template +bool Gaussian( + Generator g, T* output, const int size, const T& mean, const T& std) { + std::normal_distribution distribution(mean, std); + for (int i = 0; i < size; ++i) { + output[i] = distribution(g()); + } + return true; +} + class RandomOp : public framework::OperatorWithKernel { protected: void InferShape( @@ -12,7 +24,7 @@ protected: PADDLE_ENFORCE(outputs.size() == 1, "Output size of RandomOp must be one."); PADDLE_ENFORCE(inputs[0] != nullptr && outputs[0] != nullptr, "Inputs/Outputs of RandomOp must all be set."); - outputs[0]->set_dims(inputs[0]->dims()); + outputs[0]->set_dims(context.op_.attrs_.at("shape")); } }; diff --git a/paddle/operators/random_op.cu b/paddle/operators/random_op.cu index 50985f6699..ea1096aeb9 100644 --- a/paddle/operators/random_op.cu +++ b/paddle/operators/random_op.cu @@ -1,6 +1,19 @@ #include "paddle/operators/random_op.h" #include "paddle/framework/op_registry.h" +namespace paddle { +namespace operators { + +using paddle::platform::GPUPlace; 
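+// GPU path: sampling is delegated to cuRAND. curandGenerateNormal(gen, out,
+// n, mean, stddev) fills the device buffer `out` with n floats drawn from
+// N(mean, stddev); `gen` must be a curandGenerator_t handle, and cuRAND
+// requires n to be even for its pseudorandom generators.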
+template +bool Gaussian(Generator g, T* output, const int size, const T& mean, const T& std) { + return curandGenerateNormal(g, output, size, mean, std); +} + +} // operators +} // paddle + + typedef paddle::operators::RandomOpKernel RandomOpKernel_GPU_float; REGISTER_OP_GPU_KERNEL(random_op, RandomOpKernel_GPU_float); \ No newline at end of file diff --git a/paddle/operators/random_op.h b/paddle/operators/random_op.h index 123d9d6ffa..1b5fb16de1 100644 --- a/paddle/operators/random_op.h +++ b/paddle/operators/random_op.h @@ -6,24 +6,52 @@ namespace paddle { namespace operators { +template +bool Gaussian( + Generator g, T* output, const int size, const T& mean, const T& std); + template class RandomOpKernel : public framework::OpKernel { public: void Compute(const framework::KernelContext& context) const override { - auto* output = context.Output(0)->GetMutable(); - output->mutable_data(context.GetPlace()); - - auto shape = context.op_.attrs_.at("Shape"); auto mean = context.op_.attrs_.at("mean"); auto std = context.op_.attrs_.at("std"); auto seed = context.op_.attrs_.at("seed"); + auto* output = context.Output(0)->GetMutable(); + output->mutable_data(context.GetPlace()); + + Gaussian(, output, output->size(), mean, std) : // std::default_random_engine generator(seed); // std::normal_distribution distribution(mean, std); - framework::EigenMatrix::From(*output).device(*( - context.GetEigenDevice())) = framework::EigenMatrix::Random(); + // framework::EigenMatrix::From(*output).device(*( + // context.GetEigenDevice())) = + // framework::EigenMatrix::Random(); } }; +// using paddle::platform::CPUPlace; +// template +// class RandomOpKernel : public framework::OpKernel { +// public: +// void Compute(const framework::KernelContext& context) const override { + +// std::unique_ptr generator(seed); +// for(size_t i=0; i < output->size(); ++i) { +// output[i] = distribution(generator()); +// } +// } + +// }; + +// using paddle::platform::GPUPlace; +// template +// class RandomOpKernel : public framework::OpKernel { +// public: +// void Compute(const framework::KernelContext& context) const override { + +// } +// } + } // namespace operators } // namespace paddle From 6f80b5f1df2b4d77857338f44c3159388602457b Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 25 Jul 2017 12:00:47 +0800 Subject: [PATCH 03/51] "move to template function" --- paddle/operators/random_op.cc | 34 ++++++++++++++++++----- paddle/operators/random_op.cu | 7 ++--- paddle/operators/random_op.h | 28 +++++++++++++------ paddle/platform/device_context.h | 46 ++++++++++++++++++++------------ 4 files changed, 81 insertions(+), 34 deletions(-) diff --git a/paddle/operators/random_op.cc b/paddle/operators/random_op.cc index b85ff84220..a536ee74b4 100644 --- a/paddle/operators/random_op.cc +++ b/paddle/operators/random_op.cc @@ -1,13 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + #include "paddle/operators/random_op.h" #include "paddle/framework/op_registry.h" namespace paddle { namespace operators { -using paddle::platform::GPUPlace; -template -bool Gaussian( - Generator g, T* output, const int size, const T& mean, const T& std) { +// using paddle::platform::CPUPlace; +// template +template +bool Gaussian(platform::CPUDeviceContext& ctx, + framework::Tensor* output, + const int size, + const T& mean, + const T& std, + const T& seed) { + auto g = ctx.RandGenerator(seed); std::normal_distribution distribution(mean, std); for (int i = 0; i < size; ++i) { output[i] = distribution(g()); @@ -24,7 +44,9 @@ protected: PADDLE_ENFORCE(outputs.size() == 1, "Output size of RandomOp must be one."); PADDLE_ENFORCE(inputs[0] != nullptr && outputs[0] != nullptr, "Inputs/Outputs of RandomOp must all be set."); - outputs[0]->set_dims(context.op_.attrs_.at("shape")); + outputs[0]->Resize( + framework::make_ddim(this->GetAttr>("shape"))); + // outputs[0]->set_dims(context.op_.attrs_.at("shape")); } }; @@ -32,7 +54,7 @@ class RandomOpMaker : public framework::OpProtoAndCheckerMaker { public: RandomOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddAttr>("Shape", "The shape of matrix to be randomized"); + AddAttr>("shape", "The shape of matrix to be randomized"); AddAttr("seed", "random seed generator.").SetDefault(1337); AddAttr("mean", "mean value of random.").SetDefault(.0); AddAttr("std", "minimum value of random value") diff --git a/paddle/operators/random_op.cu b/paddle/operators/random_op.cu index ea1096aeb9..40b642d8a1 100644 --- a/paddle/operators/random_op.cu +++ b/paddle/operators/random_op.cu @@ -4,9 +4,10 @@ namespace paddle { namespace operators { -using paddle::platform::GPUPlace; -template -bool Gaussian(Generator g, T* output, const int size, const T& mean, const T& std) { +template +bool Gaussian(platform::CUDADeviceContext &ctx, framework::Tensor* output, + const int size, const T& mean, const T& std, const T& seed) { + auto g = RandGenerator(seed); return curandGenerateNormal(g, output, size, mean, std); } diff --git a/paddle/operators/random_op.h b/paddle/operators/random_op.h index 1b5fb16de1..a82b3afec8 100644 --- a/paddle/operators/random_op.h +++ b/paddle/operators/random_op.h @@ -6,21 +6,33 @@ namespace paddle { namespace operators { -template -bool Gaussian( - Generator g, T* output, const int size, const T& mean, const T& std); +template +bool Gaussian(DeviceContext& ctx, + framework::Tensor* output, + const int size, + const T& mean, + const T& std, + const T& seed); template class RandomOpKernel : public framework::OpKernel { public: void Compute(const framework::KernelContext& context) const override { - auto mean = context.op_.attrs_.at("mean"); - auto std = context.op_.attrs_.at("std"); - auto seed = context.op_.attrs_.at("seed"); + auto mean = context.op_.GetAttr("mean"); + auto std = context.op_.GetAttr("std"); + auto seed = context.op_.GetAttr("seed"); auto* output = context.Output(0)->GetMutable(); output->mutable_data(context.GetPlace()); - - Gaussian(, output, output->size(), mean, std) : + Gaussian(context.device_context_, + output, + framework::product(output->dims()), + mean, + std, + seed); + // Gaussian(context.device_context_, + // output, + // framework::product(output->dims()), + // mean, std, seed); // std::default_random_engine generator(seed); // std::normal_distribution distribution(mean, std); diff --git a/paddle/platform/device_context.h 
b/paddle/platform/device_context.h index fe6f13e399..b8af4abd7f 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -39,6 +39,7 @@ class DeviceContext { class CPUDeviceContext : public DeviceContext { public: + typedef std::mt19937 random_generator_type; CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } Eigen::DefaultDevice* eigen_device() const { return eigen_device_.get(); } @@ -48,7 +49,17 @@ class CPUDeviceContext : public DeviceContext { return retv; } + const random_generator_type& RandGenerator(const int seed) { + if (!rand_generator_) { + random_seed_ = seed; + rand_generator_.reset(new random_generator_type(random_seed_)); + } + return *rand_generator_.get(); + } + private: + int random_seed_; + std::unique_ptr rand_generator_; std::unique_ptr eigen_device_; }; @@ -87,6 +98,24 @@ class CUDADeviceContext : public DeviceContext { "cudaStreamSynchronize failed"); } + const curandGenerator_t RandGenerator(const int seed) { + if (!rand_generator_) { + random_seed_ = seed; + GPUPlaceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( + &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT), + "curandCreateGenerator failed"); + PADDLE_ENFORCE( + paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed( + rand_generator_, random_seed_), + "curandSetPseudoRandomGeneratorSeed failed"); + PADDLE_ENFORCE( + paddle::platform::dynload::curandSetStream(rand_generator_, stream_), + "curandSetStream failed"); + } + return rand_generator_; + } + cudaStream_t stream() { return stream_; } Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); } @@ -115,23 +144,6 @@ class CUDADeviceContext : public DeviceContext { return dnn_handle_; } - curandGenerator_t curand_generator() { - if (!rand_generator_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( - &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT), - "curandCreateGenerator failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed( - rand_generator_, random_seed_), - "curandSetPseudoRandomGeneratorSeed failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::curandSetStream(rand_generator_, stream_), - "curandSetStream failed"); - } - return rand_generator_; - } - ~CUDADeviceContext() { Wait(); if (blas_handle_) { From 32c15a291917786ba136b76adb289aaa78527252 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 25 Jul 2017 15:00:24 +0800 Subject: [PATCH 04/51] "random op test" --- paddle/operators/random_op.h | 1 + paddle/pybind/pybind.cc | 1 + .../v2/framework/tests/test_plain_net.py | 12 ++++---- .../v2/framework/tests/test_random_op.py | 28 +++++++++++++++++++ 4 files changed, 36 insertions(+), 6 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_random_op.py diff --git a/paddle/operators/random_op.h b/paddle/operators/random_op.h index a82b3afec8..bee6cc9cbd 100644 --- a/paddle/operators/random_op.h +++ b/paddle/operators/random_op.h @@ -6,6 +6,7 @@ namespace paddle { namespace operators { + template bool Gaussian(DeviceContext& ctx, framework::Tensor* output, diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index d48a948d21..f6e9013471 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -36,6 +36,7 @@ USE_OP(mul); USE_OP(sigmoid); USE_OP(softmax); USE_OP(rowwise_add); +USE_OP(random_op); template void ExposeOperator(ClassType& m) { diff --git a/python/paddle/v2/framework/tests/test_plain_net.py 
b/python/paddle/v2/framework/tests/test_plain_net.py index 2b919aca28..53c8dd6c22 100644 --- a/python/paddle/v2/framework/tests/test_plain_net.py +++ b/python/paddle/v2/framework/tests/test_plain_net.py @@ -16,13 +16,13 @@ class TestNet(unittest.TestCase): net.complete_add_op(True) expected = ''' -Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, Out, fc.out). - Op(add_two), inputs:(X, Y), outputs:(Out). - Op(plain_net), inputs:(@EMPTY@, X, w), outputs:(@TEMP@fc@0, fc.out). + Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, Out, fc.out). + Op(add_two), inputs:(X, Y), outputs:(Out). + Op(plain_net), inputs:(@EMPTY@, X, w), outputs:(@TEMP@fc@0, fc.out). Op(fc), inputs:(X, w, @EMPTY@), outputs:(fc.out, @TEMP@fc@0). - Op(mul), inputs:(X, w), outputs:(@TEMP@fc@0). - Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc.out). -''' + Op(mul), inputs:(X, w), outputs:(@TEMP@fc@0). + Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc.out). + ''' self.assertEqual(expected, "\n" + str(net)) diff --git a/python/paddle/v2/framework/tests/test_random_op.py b/python/paddle/v2/framework/tests/test_random_op.py new file mode 100644 index 0000000000..eb69f35edf --- /dev/null +++ b/python/paddle/v2/framework/tests/test_random_op.py @@ -0,0 +1,28 @@ +import unittest +import paddle.v2.framework.create_op_creation_methods as creation +import paddle.v2.framework.core as core +from op_test_util import OpTestMeta +import numpy + + +class TestRandomOp(unittest.TestCase): + def test_random(self): + scope = core.Scope(None) + # Out = scope.create_var("Out") + op = creation.op_creations.random( + shape=[1000, 1000], mean=5.0, std=1.0, seed=1701, Out="Out") + for out in op.outputs(): + if scope.get_var(out) is None: + scope.create_var(out).get_tensor() + + tensor = scope.get_var("Y").get_tensor() + op.infer_shape(scope) + self.assertEqual([1000, 1000], tensor.shape()) + ctx = core.DeviceContext.cpu_context() + op.run(scope, ctx) + self.assertAlmostEqual(numpy.std(tensor), 1.0) + self.assertAlmostEqual(numpy.mean(tensor), 5.0) + + +if __name__ == '__main__': + unittest.main() From 30a47fe8a321fd92b05fcf71e3668862176d1f91 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 25 Jul 2017 15:19:13 +0800 Subject: [PATCH 05/51] "link pybind11" --- paddle/operators/random_op.cc | 4 ++-- paddle/operators/random_op.cu | 2 +- paddle/pybind/CMakeLists.txt | 2 +- paddle/pybind/pybind.cc | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/operators/random_op.cc b/paddle/operators/random_op.cc index a536ee74b4..05a3dbd9f4 100644 --- a/paddle/operators/random_op.cc +++ b/paddle/operators/random_op.cc @@ -71,10 +71,10 @@ The eqution : Out = Random(Shape=(d0, d1, ...), Dtype, mean, std) } // namespace operators } // namespace paddle -REGISTER_OP(random_op, +REGISTER_OP(random, paddle::operators::RandomOp, paddle::operators::RandomOpMaker); typedef paddle::operators::RandomOpKernel RandomOpKernel_CPU_float; -REGISTER_OP_CPU_KERNEL(random_op, RandomOpKernel_CPU_float); +REGISTER_OP_CPU_KERNEL(random, RandomOpKernel_CPU_float); diff --git a/paddle/operators/random_op.cu b/paddle/operators/random_op.cu index 40b642d8a1..85054974ac 100644 --- a/paddle/operators/random_op.cu +++ b/paddle/operators/random_op.cu @@ -17,4 +17,4 @@ bool Gaussian(platform::CUDADeviceContext &ctx, framework::Tensor* output, typedef paddle::operators::RandomOpKernel RandomOpKernel_GPU_float; -REGISTER_OP_GPU_KERNEL(random_op, RandomOpKernel_GPU_float); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(random, 
RandomOpKernel_GPU_float); \ No newline at end of file diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index fd1a142b40..8010369b41 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python - add_op fc_op sgd_op cross_entropy_op) + add_op fc_op sgd_op cross_entropy_op random_op) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index f6e9013471..1138b07ec4 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -36,7 +36,7 @@ USE_OP(mul); USE_OP(sigmoid); USE_OP(softmax); USE_OP(rowwise_add); -USE_OP(random_op); +USE_OP(random); template void ExposeOperator(ClassType& m) { From 2b3e362136f94a20503cdb1133762e33b18bf6f3 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 25 Jul 2017 15:50:51 +0800 Subject: [PATCH 06/51] "template specialization link include" --- paddle/operators/random_op.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/paddle/operators/random_op.h b/paddle/operators/random_op.h index bee6cc9cbd..3eeb1f87c8 100644 --- a/paddle/operators/random_op.h +++ b/paddle/operators/random_op.h @@ -15,6 +15,21 @@ bool Gaussian(DeviceContext& ctx, const T& std, const T& seed); +template +bool Gaussian(platform::CPUDeviceContext& ctx, + framework::Tensor* output, + const int size, + const T& mean, + const T& std, + const T& seed); +template +bool Gaussian(platform::CUDADeviceContext& ctx, + framework::Tensor* output, + const int size, + const T& mean, + const T& std, + const T& seed); + template class RandomOpKernel : public framework::OpKernel { public: From 984225ecf198525a134acbda0fb6cab177a59ebd Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 25 Jul 2017 16:07:08 +0800 Subject: [PATCH 07/51] "fix operator" --- paddle/framework/operator.cc | 14 ++++- paddle/operators/random_op.cc | 23 ++------ paddle/operators/random_op.cu | 13 ----- paddle/operators/random_op.h | 54 +++++++------------ .../paddle/v2/framework/tests/CMakeLists.txt | 3 +- .../v2/framework/tests/test_random_op.py | 7 +-- 6 files changed, 39 insertions(+), 75 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 1e57e9a20f..18e327089f 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/framework/operator.h" +#include +#include namespace paddle { namespace framework { @@ -95,6 +95,16 @@ std::string OperatorBase::DebugString() const { ss << ", "; } } + ss << "), "; + ss << "Attrs:("; + size_t i = 0; + for (auto& attr : attrs_) { + ss << attr.first; + if (i != attrs_.size() - 1) { + ss << ", "; + } + i++; + } ss << ")."; return ss.str(); } diff --git a/paddle/operators/random_op.cc b/paddle/operators/random_op.cc index 05a3dbd9f4..726f6504e7 100644 --- a/paddle/operators/random_op.cc +++ b/paddle/operators/random_op.cc @@ -13,28 +13,12 @@ limitations under the License. 
*/ #include "paddle/operators/random_op.h" +#include "glog/logging.h" #include "paddle/framework/op_registry.h" namespace paddle { namespace operators { -// using paddle::platform::CPUPlace; -// template -template -bool Gaussian(platform::CPUDeviceContext& ctx, - framework::Tensor* output, - const int size, - const T& mean, - const T& std, - const T& seed) { - auto g = ctx.RandGenerator(seed); - std::normal_distribution distribution(mean, std); - for (int i = 0; i < size; ++i) { - output[i] = distribution(g()); - } - return true; -} - class RandomOp : public framework::OperatorWithKernel { protected: void InferShape( @@ -42,11 +26,10 @@ protected: const std::vector& outputs) const override { PADDLE_ENFORCE(inputs.size() == 0, "Input size of RandomOp must be zero."); PADDLE_ENFORCE(outputs.size() == 1, "Output size of RandomOp must be one."); - PADDLE_ENFORCE(inputs[0] != nullptr && outputs[0] != nullptr, - "Inputs/Outputs of RandomOp must all be set."); + PADDLE_ENFORCE(outputs[0] != nullptr, + "Outputs of RandomOp must all be set."); outputs[0]->Resize( framework::make_ddim(this->GetAttr>("shape"))); - // outputs[0]->set_dims(context.op_.attrs_.at("shape")); } }; diff --git a/paddle/operators/random_op.cu b/paddle/operators/random_op.cu index 85054974ac..b417666c98 100644 --- a/paddle/operators/random_op.cu +++ b/paddle/operators/random_op.cu @@ -1,19 +1,6 @@ #include "paddle/operators/random_op.h" #include "paddle/framework/op_registry.h" -namespace paddle { -namespace operators { - -template -bool Gaussian(platform::CUDADeviceContext &ctx, framework::Tensor* output, - const int size, const T& mean, const T& std, const T& seed) { - auto g = RandGenerator(seed); - return curandGenerateNormal(g, output, size, mean, std); -} - -} // operators -} // paddle - typedef paddle::operators::RandomOpKernel RandomOpKernel_GPU_float; diff --git a/paddle/operators/random_op.h b/paddle/operators/random_op.h index 3eeb1f87c8..f8e1a90a1d 100644 --- a/paddle/operators/random_op.h +++ b/paddle/operators/random_op.h @@ -13,7 +13,9 @@ bool Gaussian(DeviceContext& ctx, const int size, const T& mean, const T& std, - const T& seed); + const T& seed) { + return false; +} template bool Gaussian(platform::CPUDeviceContext& ctx, @@ -21,14 +23,27 @@ bool Gaussian(platform::CPUDeviceContext& ctx, const int size, const T& mean, const T& std, - const T& seed); + const T& seed) { + auto g = ctx.RandGenerator(seed); + std::normal_distribution distribution(mean, std); + for (int i = 0; i < size; ++i) { + output[i] = distribution(g); + } + return true; +} + +#ifndef PADDLE_ONLY_CPU template bool Gaussian(platform::CUDADeviceContext& ctx, framework::Tensor* output, const int size, const T& mean, const T& std, - const T& seed); + const T& seed) { + auto g = RandGenerator(seed); + return curandGenerateNormal(g, output, size, mean, std); +} +#endif template class RandomOpKernel : public framework::OpKernel { @@ -45,41 +60,8 @@ public: mean, std, seed); - // Gaussian(context.device_context_, - // output, - // framework::product(output->dims()), - // mean, std, seed); - // std::default_random_engine generator(seed); - // std::normal_distribution distribution(mean, std); - - // framework::EigenMatrix::From(*output).device(*( - // context.GetEigenDevice())) = - // framework::EigenMatrix::Random(); } }; -// using paddle::platform::CPUPlace; -// template -// class RandomOpKernel : public framework::OpKernel { -// public: -// void Compute(const framework::KernelContext& context) const override { - -// std::unique_ptr generator(seed); 
-// for(size_t i=0; i < output->size(); ++i) { -// output[i] = distribution(generator()); -// } -// } - -// }; - -// using paddle::platform::GPUPlace; -// template -// class RandomOpKernel : public framework::OpKernel { -// public: -// void Compute(const framework::KernelContext& context) const override { - -// } -// } - } // namespace operators } // namespace paddle diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index b3eb2ef8a8..254e8d37d1 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -12,4 +12,5 @@ add_python_test(test_framework test_mul_op.py test_sigmoid_op.py test_softmax_op.py - test_rowwise_add_op.py) + test_rowwise_add_op.py + test_random_op.py) diff --git a/python/paddle/v2/framework/tests/test_random_op.py b/python/paddle/v2/framework/tests/test_random_op.py index eb69f35edf..e2aa9bdfc2 100644 --- a/python/paddle/v2/framework/tests/test_random_op.py +++ b/python/paddle/v2/framework/tests/test_random_op.py @@ -15,13 +15,14 @@ class TestRandomOp(unittest.TestCase): if scope.get_var(out) is None: scope.create_var(out).get_tensor() - tensor = scope.get_var("Y").get_tensor() + tensor = scope.get_var("Out").get_tensor() op.infer_shape(scope) self.assertEqual([1000, 1000], tensor.shape()) ctx = core.DeviceContext.cpu_context() op.run(scope, ctx) - self.assertAlmostEqual(numpy.std(tensor), 1.0) - self.assertAlmostEqual(numpy.mean(tensor), 5.0) + tensor_array = numpy.array(tensor) + self.assertAlmostEqual(numpy.std(tensor_array), 1.0) + self.assertAlmostEqual(numpy.mean(tensor_array), 5.0) if __name__ == '__main__': From 11f9f5fb172f620d5221c93fe26196ebd244df79 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 26 Jul 2017 00:40:37 +0800 Subject: [PATCH 08/51] "fix const dependency hell" --- paddle/framework/operator.cc | 4 +-- paddle/framework/operator.h | 14 ++++----- paddle/operators/random_op.h | 49 ++++++++++++++++---------------- paddle/platform/device_context.h | 4 +-- 4 files changed, 36 insertions(+), 35 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 18e327089f..0a317dffa9 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -22,14 +22,14 @@ namespace framework { template <> Eigen::DefaultDevice* KernelContext::GetEigenDevice< platform::CPUPlace, Eigen::DefaultDevice>() const { - return device_context_.get_eigen_device(); + return device_context_->get_eigen_device(); } #ifndef PADDLE_ONLY_CPU template <> Eigen::GpuDevice* KernelContext::GetEigenDevice() const { - return device_context_.get_eigen_device(); + return device_context_->get_eigen_device(); } #endif diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index f59314f828..5db041ea32 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -88,7 +88,7 @@ class OperatorBase { /// Net will call this function to Run an op. 
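  /// `scope` supplies the variables the op reads and writes; `dev_ctx`
  /// carries the per-device state (CPU or GPU) the kernel executes against.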
virtual void Run(const std::shared_ptr& scope, - const platform::DeviceContext& dev_ctx) const = 0; + platform::DeviceContext& dev_ctx) const = 0; // Get a input with argument's name described in `op_proto` const std::string& Input(const std::string& name) const; @@ -113,8 +113,8 @@ class OperatorBase { class KernelContext { public: KernelContext(const OperatorBase* op, const std::shared_ptr& scope, - const platform::DeviceContext& device_context) - : op_(*op), scope_(scope), device_context_(device_context) {} + platform::DeviceContext& device_context) + : op_(*op), scope_(scope), device_context_(&device_context) {} const Variable* Input(int index) const { return scope_->GetVariable(op_.inputs_[index]); @@ -155,11 +155,11 @@ class KernelContext { typename EigenDeviceConverter::EigenDeviceType> DeviceType* GetEigenDevice() const; - platform::Place GetPlace() const { return device_context_.GetPlace(); } + platform::Place GetPlace() const { return device_context_->GetPlace(); } const OperatorBase& op_; - const std::shared_ptr& scope_; - const platform::DeviceContext& device_context_; + const std::shared_ptr scope_; + platform::DeviceContext* device_context_; }; class OpKernel { @@ -213,7 +213,7 @@ class OperatorWithKernel : public OperatorBase { std::unordered_map, OpKernelHash>; void Run(const std::shared_ptr& scope, - const platform::DeviceContext& dev_ctx) const final { + platform::DeviceContext& dev_ctx) const final { auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); opKernel->Compute(KernelContext(this, scope, dev_ctx)); } diff --git a/paddle/operators/random_op.h b/paddle/operators/random_op.h index f8e1a90a1d..8231b6b613 100644 --- a/paddle/operators/random_op.h +++ b/paddle/operators/random_op.h @@ -7,25 +7,15 @@ namespace paddle { namespace operators { -template -bool Gaussian(DeviceContext& ctx, - framework::Tensor* output, - const int size, - const T& mean, - const T& std, - const T& seed) { - return false; -} - template -bool Gaussian(platform::CPUDeviceContext& ctx, - framework::Tensor* output, +bool Gaussian(platform::CPUDeviceContext* ctx, + T* output, const int size, const T& mean, const T& std, const T& seed) { - auto g = ctx.RandGenerator(seed); - std::normal_distribution distribution(mean, std); + auto g = ctx->RandGenerator(seed); + std::normal_distribution distribution(mean, std); for (int i = 0; i < size; ++i) { output[i] = distribution(g); } @@ -34,13 +24,13 @@ bool Gaussian(platform::CPUDeviceContext& ctx, #ifndef PADDLE_ONLY_CPU template -bool Gaussian(platform::CUDADeviceContext& ctx, - framework::Tensor* output, +bool Gaussian(platform::CUDADeviceContext* ctx, + T* output, const int size, const T& mean, const T& std, const T& seed) { - auto g = RandGenerator(seed); + auto g = ctx->RandGenerator(seed); return curandGenerateNormal(g, output, size, mean, std); } #endif @@ -53,13 +43,24 @@ public: auto std = context.op_.GetAttr("std"); auto seed = context.op_.GetAttr("seed"); auto* output = context.Output(0)->GetMutable(); - output->mutable_data(context.GetPlace()); - Gaussian(context.device_context_, - output, - framework::product(output->dims()), - mean, - std, - seed); + auto place = context.GetPlace(); + if (platform::is_cpu_place(place)) { + Gaussian( + dynamic_cast(context.device_context_), + output->mutable_data(context.GetPlace()), + framework::product(output->dims()), + mean, + std, + seed); + } else { + Gaussian( + dynamic_cast(context.device_context_), + output->mutable_data(context.GetPlace()), + framework::product(output->dims()), + mean, 
+ std, + seed); + } } }; diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index b8af4abd7f..7bc34bd545 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -49,7 +49,7 @@ class CPUDeviceContext : public DeviceContext { return retv; } - const random_generator_type& RandGenerator(const int seed) { + random_generator_type& RandGenerator(const int seed) { if (!rand_generator_) { random_seed_ = seed; rand_generator_.reset(new random_generator_type(random_seed_)); @@ -98,7 +98,7 @@ class CUDADeviceContext : public DeviceContext { "cudaStreamSynchronize failed"); } - const curandGenerator_t RandGenerator(const int seed) { + curandGenerator_t RandGenerator(const int seed) { if (!rand_generator_) { random_seed_ = seed; GPUPlaceGuard guard(gpu_place_); From 9a16327b15cc5c094a3f6373e6bc089d26ee5b00 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 26 Jul 2017 00:44:42 +0800 Subject: [PATCH 09/51] "remove const qualify" --- paddle/framework/net.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 3264f1f565..d3e3e80d5b 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -71,7 +71,7 @@ class PlainNet : public Net { * will be used. */ void Run(const std::shared_ptr& scope, - const platform::DeviceContext& dev_ctx) const override { + platform::DeviceContext& dev_ctx) const override { for (auto& op : ops_) { op->Run(scope, dev_ctx); } From 69b1b26511d6a838b4542e7844fd13fd257d96b8 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 26 Jul 2017 00:47:22 +0800 Subject: [PATCH 10/51] "cpu only macro" --- paddle/operators/random_op.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/operators/random_op.h b/paddle/operators/random_op.h index 8231b6b613..26dba130e4 100644 --- a/paddle/operators/random_op.h +++ b/paddle/operators/random_op.h @@ -53,6 +53,7 @@ public: std, seed); } else { +#ifndef PADDLE_ONLY_CPU Gaussian( dynamic_cast(context.device_context_), output->mutable_data(context.GetPlace()), @@ -60,6 +61,7 @@ public: mean, std, seed); +#endif } } }; From a22567ebefbd29644603b66f44273bfb33fc8434 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 26 Jul 2017 00:51:46 +0800 Subject: [PATCH 11/51] "fix almost equal error" --- python/paddle/v2/framework/tests/test_random_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_random_op.py b/python/paddle/v2/framework/tests/test_random_op.py index e2aa9bdfc2..447e3e39ab 100644 --- a/python/paddle/v2/framework/tests/test_random_op.py +++ b/python/paddle/v2/framework/tests/test_random_op.py @@ -21,8 +21,8 @@ class TestRandomOp(unittest.TestCase): ctx = core.DeviceContext.cpu_context() op.run(scope, ctx) tensor_array = numpy.array(tensor) - self.assertAlmostEqual(numpy.std(tensor_array), 1.0) - self.assertAlmostEqual(numpy.mean(tensor_array), 5.0) + self.assertAlmostEqual(numpy.mean(tensor_array), 5.0, places=3) + self.assertAlmostEqual(numpy.std(tensor_array), 1.0, places=3) if __name__ == '__main__': From 572133400d3f4073d9a9206db5ed1ced3e39623d Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Sun, 30 Jul 2017 22:13:26 +0800 Subject: [PATCH 12/51] "update the compute kernel" --- paddle/framework/operator.h | 8 ++--- paddle/operators/random_op.cc | 47 +++++++++++++++++++------- paddle/operators/random_op.cu | 25 +++++++++++++- paddle/operators/random_op.h | 57 ++------------------------------ paddle/platform/device_context.h | 19 
+++++++---- 5 files changed, 77 insertions(+), 79 deletions(-) diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 5db041ea32..9ba661968c 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -88,7 +88,7 @@ class OperatorBase { /// Net will call this function to Run an op. virtual void Run(const std::shared_ptr& scope, - platform::DeviceContext& dev_ctx) const = 0; + const platform::DeviceContext& dev_ctx) const = 0; // Get a input with argument's name described in `op_proto` const std::string& Input(const std::string& name) const; @@ -113,7 +113,7 @@ class OperatorBase { class KernelContext { public: KernelContext(const OperatorBase* op, const std::shared_ptr& scope, - platform::DeviceContext& device_context) + const platform::DeviceContext& device_context) : op_(*op), scope_(scope), device_context_(&device_context) {} const Variable* Input(int index) const { @@ -159,7 +159,7 @@ class KernelContext { const OperatorBase& op_; const std::shared_ptr scope_; - platform::DeviceContext* device_context_; + const platform::DeviceContext* device_context_; }; class OpKernel { @@ -213,7 +213,7 @@ class OperatorWithKernel : public OperatorBase { std::unordered_map, OpKernelHash>; void Run(const std::shared_ptr& scope, - platform::DeviceContext& dev_ctx) const final { + const platform::DeviceContext& dev_ctx) const final { auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); opKernel->Compute(KernelContext(this, scope, dev_ctx)); } diff --git a/paddle/operators/random_op.cc b/paddle/operators/random_op.cc index 726f6504e7..16e526dc4f 100644 --- a/paddle/operators/random_op.cc +++ b/paddle/operators/random_op.cc @@ -19,7 +19,28 @@ namespace paddle { namespace operators { -class RandomOp : public framework::OperatorWithKernel { +template +class GaussianRandomOpKernel + : public framework::OpKernel { +public: + void Compute(const framework::KernelContext& context) const override { + auto mean = context.op_.GetAttr("mean"); + auto std = context.op_.GetAttr("std"); + // auto seed = context.op_.GetAttr("seed"); + auto* output = context.Output(0)->GetMutable(); + T* r = output->mutable_data(context.GetPlace()); + auto ctx = + static_cast(context.device_context_); + // generator need to modify context + auto g = const_cast(ctx)->RandGenerator(); + std::normal_distribution distribution(mean, std); + for (int i = 0; i < framework::product(output->dims()); ++i) { + r[i] = distribution(g); + } + } +}; + +class GaussianRandomOp : public framework::OperatorWithKernel { protected: void InferShape( const std::vector& inputs, @@ -33,20 +54,21 @@ protected: } }; -class RandomOpMaker : public framework::OpProtoAndCheckerMaker { +class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { public: - RandomOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + GaussianRandomOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddAttr>("shape", "The shape of matrix to be randomized"); - AddAttr("seed", "random seed generator.").SetDefault(1337); + // AddAttr("seed", "random seed generator.").SetDefault(1337); AddAttr("mean", "mean value of random.").SetDefault(.0); AddAttr("std", "minimum value of random value") .SetDefault(1.0) .LargerThan(.0); AddOutput("Out", "output matrix of random op"); AddComment(R"DOC( -Random Operator fill a matrix in normal distribution. 
-The eqution : Out = Random(Shape=(d0, d1, ...), Dtype, mean, std) +GaussianRandom Operator fill a matrix in normal distribution. +The eqution : Out = GaussianRandom(Shape=(d0, d1, ...), Dtype, mean, std) )DOC"); } }; @@ -54,10 +76,11 @@ The eqution : Out = Random(Shape=(d0, d1, ...), Dtype, mean, std) } // namespace operators } // namespace paddle -REGISTER_OP(random, - paddle::operators::RandomOp, - paddle::operators::RandomOpMaker); +REGISTER_OP(gaussian_random, + paddle::operators::GaussianRandomOp, + paddle::operators::GaussianRandomOpMaker); -typedef paddle::operators::RandomOpKernel - RandomOpKernel_CPU_float; -REGISTER_OP_CPU_KERNEL(random, RandomOpKernel_CPU_float); +typedef paddle::operators::GaussianRandomOpKernel + GaussianRandomOpKernel_CPU_float; +REGISTER_OP_CPU_KERNEL(gaussian_random, GaussianRandomOpKernel_CPU_float); diff --git a/paddle/operators/random_op.cu b/paddle/operators/random_op.cu index b417666c98..78a00bc899 100644 --- a/paddle/operators/random_op.cu +++ b/paddle/operators/random_op.cu @@ -1,7 +1,30 @@ #include "paddle/operators/random_op.h" #include "paddle/framework/op_registry.h" +namespace paddle { +namespace operators { + +template +class GaussianRandomOpKernel : public framework::OpKernel { +public: + void Compute(const framework::KernelContext& context) const override { + auto mean = context.op_.GetAttr("mean"); + auto std = context.op_.GetAttr("std"); + auto* output = context.Output(0)->GetMutable(); + T* r = output->mutable_data(context.GetPlace()); + auto ctx = static_cast + (context.device_context_); + // generator need to modify context + auto g = const_cast(ctx)->RandGenerator(); + curandGenerateNormal(g, r, framework::product(output->dims()), mean, std); -typedef paddle::operators::RandomOpKernel + } +}; + +} // namespace operators +} // namespace paddle + + +typedef paddle::operators::GaussianRandomOpKernel RandomOpKernel_GPU_float; REGISTER_OP_GPU_KERNEL(random, RandomOpKernel_GPU_float); \ No newline at end of file diff --git a/paddle/operators/random_op.h b/paddle/operators/random_op.h index 26dba130e4..b463a171d9 100644 --- a/paddle/operators/random_op.h +++ b/paddle/operators/random_op.h @@ -7,63 +7,10 @@ namespace paddle { namespace operators { -template -bool Gaussian(platform::CPUDeviceContext* ctx, - T* output, - const int size, - const T& mean, - const T& std, - const T& seed) { - auto g = ctx->RandGenerator(seed); - std::normal_distribution distribution(mean, std); - for (int i = 0; i < size; ++i) { - output[i] = distribution(g); - } - return true; -} - -#ifndef PADDLE_ONLY_CPU -template -bool Gaussian(platform::CUDADeviceContext* ctx, - T* output, - const int size, - const T& mean, - const T& std, - const T& seed) { - auto g = ctx->RandGenerator(seed); - return curandGenerateNormal(g, output, size, mean, std); -} -#endif - template -class RandomOpKernel : public framework::OpKernel { +class GaussianRandomOpKernel : public framework::OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto mean = context.op_.GetAttr("mean"); - auto std = context.op_.GetAttr("std"); - auto seed = context.op_.GetAttr("seed"); - auto* output = context.Output(0)->GetMutable(); - auto place = context.GetPlace(); - if (platform::is_cpu_place(place)) { - Gaussian( - dynamic_cast(context.device_context_), - output->mutable_data(context.GetPlace()), - framework::product(output->dims()), - mean, - std, - seed); - } else { -#ifndef PADDLE_ONLY_CPU - Gaussian( - dynamic_cast(context.device_context_), - 
output->mutable_data(context.GetPlace()), - framework::product(output->dims()), - mean, - std, - seed); -#endif - } - } + void Compute(const framework::KernelContext& context) const override {} }; } // namespace operators diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 7bc34bd545..239c25a90c 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif +#include #include #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -40,7 +41,10 @@ class DeviceContext { class CPUDeviceContext : public DeviceContext { public: typedef std::mt19937 random_generator_type; - CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } + CPUDeviceContext() { + random_seed_ = std::chrono::system_clock::now().time_since_epoch().count(); + eigen_device_.reset(new Eigen::DefaultDevice()); + } Eigen::DefaultDevice* eigen_device() const { return eigen_device_.get(); } @@ -49,16 +53,15 @@ class CPUDeviceContext : public DeviceContext { return retv; } - random_generator_type& RandGenerator(const int seed) { + random_generator_type& RandGenerator() { if (!rand_generator_) { - random_seed_ = seed; rand_generator_.reset(new random_generator_type(random_seed_)); } return *rand_generator_.get(); } private: - int random_seed_; + unsigned random_seed_; std::unique_ptr rand_generator_; std::unique_ptr eigen_device_; }; @@ -81,6 +84,9 @@ class GPUPlaceGuard { class CUDADeviceContext : public DeviceContext { public: + CUDADeviceContext() { + random_seed_ = std::chrono::system_clock::now().time_since_epoch().count(); + } explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { GPUPlaceGuard guard(gpu_place_); PADDLE_ENFORCE(cudaStreamCreate(&stream_), "cudaStreamCreate failed"); @@ -98,9 +104,8 @@ class CUDADeviceContext : public DeviceContext { "cudaStreamSynchronize failed"); } - curandGenerator_t RandGenerator(const int seed) { + curandGenerator_t RandGenerator() { if (!rand_generator_) { - random_seed_ = seed; GPUPlaceGuard guard(gpu_place_); PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT), @@ -177,7 +182,7 @@ class CUDADeviceContext : public DeviceContext { cudnnHandle_t dnn_handle_{nullptr}; - int random_seed_; + unsigned random_seed_; curandGenerator_t rand_generator_{nullptr}; }; From 36d7e1fdae2ffc937c29792882d26c492e9f6784 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Sun, 30 Jul 2017 22:20:42 +0800 Subject: [PATCH 13/51] "fix const hell" --- paddle/framework/net.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/net.h b/paddle/framework/net.h index d3e3e80d5b..3264f1f565 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -71,7 +71,7 @@ class PlainNet : public Net { * will be used. 
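   * Ops are executed one by one, in the order they were added to the net.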
*/ void Run(const std::shared_ptr& scope, - platform::DeviceContext& dev_ctx) const override { + const platform::DeviceContext& dev_ctx) const override { for (auto& op : ops_) { op->Run(scope, dev_ctx); } From 0253f2c46fcab2410d01004115a16db7f9b393be Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Sun, 30 Jul 2017 22:26:20 +0800 Subject: [PATCH 14/51] "fix bind python error" --- paddle/pybind/pybind.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 1138b07ec4..88deb56207 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -36,7 +36,7 @@ USE_OP(mul); USE_OP(sigmoid); USE_OP(softmax); USE_OP(rowwise_add); -USE_OP(random); +USE_OP(gaussian_random); template void ExposeOperator(ClassType& m) { From 4d8ece8a3c46b0c4ddec77dd3f4f70275c5c274c Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Sun, 30 Jul 2017 22:34:22 +0800 Subject: [PATCH 15/51] "update" --- python/paddle/v2/framework/tests/test_random_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/framework/tests/test_random_op.py b/python/paddle/v2/framework/tests/test_random_op.py index 447e3e39ab..1c9ff350df 100644 --- a/python/paddle/v2/framework/tests/test_random_op.py +++ b/python/paddle/v2/framework/tests/test_random_op.py @@ -9,7 +9,7 @@ class TestRandomOp(unittest.TestCase): def test_random(self): scope = core.Scope(None) # Out = scope.create_var("Out") - op = creation.op_creations.random( + op = creation.op_creations.gaussian_random( shape=[1000, 1000], mean=5.0, std=1.0, seed=1701, Out="Out") for out in op.outputs(): if scope.get_var(out) is None: From 47556689d4bb53237470bdc9deca5e54df5bda8b Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Sun, 30 Jul 2017 22:38:43 +0800 Subject: [PATCH 16/51] "remove unused code" --- paddle/operators/random_op.cc | 2 -- python/paddle/v2/framework/tests/test_random_op.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/operators/random_op.cc b/paddle/operators/random_op.cc index 16e526dc4f..674c851345 100644 --- a/paddle/operators/random_op.cc +++ b/paddle/operators/random_op.cc @@ -26,7 +26,6 @@ public: void Compute(const framework::KernelContext& context) const override { auto mean = context.op_.GetAttr("mean"); auto std = context.op_.GetAttr("std"); - // auto seed = context.op_.GetAttr("seed"); auto* output = context.Output(0)->GetMutable(); T* r = output->mutable_data(context.GetPlace()); auto ctx = @@ -60,7 +59,6 @@ public: framework::OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddAttr>("shape", "The shape of matrix to be randomized"); - // AddAttr("seed", "random seed generator.").SetDefault(1337); AddAttr("mean", "mean value of random.").SetDefault(.0); AddAttr("std", "minimum value of random value") .SetDefault(1.0) diff --git a/python/paddle/v2/framework/tests/test_random_op.py b/python/paddle/v2/framework/tests/test_random_op.py index 1c9ff350df..d3474880d3 100644 --- a/python/paddle/v2/framework/tests/test_random_op.py +++ b/python/paddle/v2/framework/tests/test_random_op.py @@ -10,7 +10,7 @@ class TestRandomOp(unittest.TestCase): scope = core.Scope(None) # Out = scope.create_var("Out") op = creation.op_creations.gaussian_random( - shape=[1000, 1000], mean=5.0, std=1.0, seed=1701, Out="Out") + shape=[1000, 1000], mean=5.0, std=1.0, Out="Out") for out in op.outputs(): if scope.get_var(out) is None: scope.create_var(out).get_tensor() From 49739265c728575734afd6079c911f8383d88346 Mon Sep 17 
00:00:00 2001 From: dongzhihong Date: Sun, 30 Jul 2017 22:46:56 +0800 Subject: [PATCH 17/51] "fix register error" --- paddle/operators/random_op.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/operators/random_op.cu b/paddle/operators/random_op.cu index 78a00bc899..c63eafb0a1 100644 --- a/paddle/operators/random_op.cu +++ b/paddle/operators/random_op.cu @@ -27,4 +27,4 @@ public: typedef paddle::operators::GaussianRandomOpKernel RandomOpKernel_GPU_float; -REGISTER_OP_GPU_KERNEL(random, RandomOpKernel_GPU_float); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(gaussian_random, GaussianRandomOpKernel_GPU_float); \ No newline at end of file From 0802197924d884c7d8a9531c541d9d4e4f376885 Mon Sep 17 00:00:00 2001 From: Zhuoyuan Date: Wed, 2 Aug 2017 16:00:06 -0700 Subject: [PATCH 18/51] gather and scatter-update added --- paddle/operators/gather_func.h | 114 ++++++++++++++++++++++++++++++ paddle/operators/scatter_func.h | 119 ++++++++++++++++++++++++++++++++ 2 files changed, 233 insertions(+) create mode 100644 paddle/operators/gather_func.h create mode 100644 paddle/operators/scatter_func.h diff --git a/paddle/operators/gather_func.h b/paddle/operators/gather_func.h new file mode 100644 index 0000000000..09e751ce17 --- /dev/null +++ b/paddle/operators/gather_func.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/framework/tensor.h" +#include "paddle/platform/place.h" +#include "paddle/framework/ddim.h" + +/** + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[Index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +Tensor* Gather_func(Tensor* Src, Tensor* Index) { + // assert index is an int-type tensor? 
+ // assert(Index->istype(int)); + + // check index of shape 1-D + assert(Index->dims().size()==1); + int index_size = Index->dims()[0]; + + // Source shape + auto src_dims = Src->dims(); + DDim output_dims(dims_src); + // Create a tensor of shape [index_size, dim_src[1:]] + output_dims[0] = index_size; + + Tensor* New_tensor; + float* output = nullptr; + + /* slice size */ + int slice_size = 1; + for(unsigned int i = 0; i < src_dims.size(); ++i) + slice_size *= src_dims[i]; + + /* Gathering */ + if (place == CPUPlace()) { + // init for CPU + output = New_tensor.mutable_data(output_dims, CPUPlace()); + CPUGather(Src->data(), Index->data(), slice_size, new_tensor->mutable_data()); + } else { // GPU + // init for GPU + output = New_tensor.mutable_data(output_dims, GPUPlace()); + /* how to specialize device??*/ + GPUGather(d, Src->data(), Index->data(), slice_size, new_tensor->mutable_data()); + } + return New_tensor; +} + +/* Implementation of CPU copy */ +template +void CPUGather(const T* params, const int* indices, + const int slice_size, const int index_size, + T* output) { + const size_t slice_bytes = slice_size * sizeof(T); + + for(int i = 0; i < index_size; ++i) + int index_ = indices[i]; + /* copy src[index_] to output[i] */ + memcpy(output + i * slice_bytes, + params + index_ * slice_bytes, + slice_bytes); +} + +/* Implementation of GPU copy: + I suppose the GPUDevice& d, contains gpu_id and thread_id + d = cuda_stream(gpu_id_, stream_id_); +*/ +template +void GPUGather(const GPUDevice& d, + const T* src, const int* Index, + const int slice_size, const int index_size, + T* output) { + int block_count = slice_size * index_size; + int thread_per_block = 1024; + + GatherOpKernel + <<>>( + src, Index, output, slice_size, + indices_size, slice_size, out_size); +} + +template +__global__ void GatherOpKernel(const T* params, const int* indices, T* out, + int64 indices_size, + int64 slice_size, int64 out_size) { + /* I suppose we have the following macro, + which I strongly suggest that we should put in cuda: + #define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + */ + CUDA_1D_KERNEL_LOOP(i, out_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + int gather_i = indices[indices_i]; + int params_i = gather_i * slice_size + slice_i; + out[i] = *(params + params_i); + } +} diff --git a/paddle/operators/scatter_func.h b/paddle/operators/scatter_func.h new file mode 100644 index 0000000000..6ee3fdf3a3 --- /dev/null +++ b/paddle/operators/scatter_func.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/framework/tensor.h" +#include "paddle/platform/place.h" +#include "paddle/framework/ddim.h" + +/** + * Return a updated tensor from source tensor, scattered according to index: + * dst[i] += src[index[i]] + * input[src]: type-T source Tensor + * input[Index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void ScatterUpdate_func(Tensor* Src, Tensor* Dst, Tensor* Index) { + // assert index is an int-type tensor + assert(Index->istype(int)); + + // Source shape + auto src_dims = Src->dims(); + auto dst_dims = Dst->dims(); + DDim output_dims(dims_src); + + // check Src shape and Dst shape should match + for(int i = 1; i < src_dims.size(); i++) + assert(src_dims[i]==dst_dims[i]); + + int index_size = Index->dims()[0]; + + /* slice size */ + int slice_size = 1; + for(unsigned int i = 0; i < src_dims.size(); ++i) + slice_size *= src_dims[i]; + + if (place == CPUPlace()) { + // init + output = new_tensor.mutable_data(output_dims, CPUPlace()); + CPUScatterUpdate(src->data(), index->data(), slice_size, new_tensor->mutable_data()); + + } else { // GPU + // init + output = new_tensor.mutable_data(output_dims, GPUPlace()); + /* how to specialize device??*/ + GPUScatterUpdate(d, src->data(), index->data(), slice_size, new_tensor->mutable_data()); + } +} + +/* Implementation of CPU copy */ +template +void CPUScatterUpdate(const T* src, const int* Index, + const int slice_size, const int index_size, + T* output) { + //const size_t slice_bytes = slice_size * sizeof(T); + + for(int i = 0; i < index_size; ++i) + int index_ = index[i]; + /* dst[index_] += src[index_] + add operation size: slice_size + */ + math::vAdd(slice_size, src + index_ * slice_bytes, + output + i * slice_bytes, + output + i * slice_bytes); + /* Scatter update, not just assign + memcpy(output + i * slice_bytes, + src + index_ * slice_bytes, + slice_bytes); + */ +} + +/* Implementation of GPU scatter: + I suppose the GPUDevice& d, contains gpu_id and thread_id + d = cuda_stream(gpu_id_, stream_id_); +*/ +template +void GPUScatterUpdate(const GPUDevice& d, + const T* src, const int* Index, + const int slice_size, const int index_size, + T* output) { + int block_count = slice_size * index_size; + int thread_per_block = 1024; + + ScatterOpKernel + <<>>( + src, Index, output, slice_size, + indices_size, slice_size, out_size); +} + +template +__global__ void ScatterOpKernel(const T* params, const int* indices, T* out, + int64 indices_size, + int64 slice_size, int64 out_size) { + /* I suppose we have the following macro, + which I strongly suggest that we should put in cuda: + #define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + */ + CUDA_1D_KERNEL_LOOP(i, out_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + int scatter_i = indices[indices_i]; + int params_i = scatter_i * slice_size + slice_i; + out[i] += *(params + params_i); + } +} From 2b35fca18f66e5f92315e369a687a5e908aedf1e Mon Sep 17 00:00:00 2001 From: Zhuoyuan Date: Wed, 2 Aug 2017 22:34:58 -0700 Subject: [PATCH 19/51] gather modify --- paddle/operators/gather_func.h | 71 ++++++++++++++++------------------ 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/paddle/operators/gather_func.h b/paddle/operators/gather_func.h index 09e751ce17..e255bd7d15 100644 --- a/paddle/operators/gather_func.h +++ b/paddle/operators/gather_func.h @@ -21,44 +21,41 @@ limitations 
under the License. */ /** * Return a new tensor from source tensor, gathered according to index * input[src]: type-T source Tensor - * input[Index]: type-int index Tensor (1-D) + * input[index]: type-int index Tensor (1-D) * return: output tensor */ -template -Tensor* Gather_func(Tensor* Src, Tensor* Index) { - // assert index is an int-type tensor? - // assert(Index->istype(int)); +template +Tensor* Gather(Tensor* src, Tensor* index) { + // check index of shape 1-D + PADDLE_ENFORCE(index->dims().size()==1); + int index_size = index->dims()[0]; - // check index of shape 1-D - assert(Index->dims().size()==1); - int index_size = Index->dims()[0]; + // Source shape + auto src_dims = src->dims(); + DDim output_dims(dims_src); + // Create a tensor of shape [index_size, dim_src[1:]] + output_dims[0] = index_size; - // Source shape - auto src_dims = Src->dims(); - DDim output_dims(dims_src); - // Create a tensor of shape [index_size, dim_src[1:]] - output_dims[0] = index_size; + Tensor* New_tensor; + float* output = nullptr; - Tensor* New_tensor; - float* output = nullptr; + /* slice size */ + int slice_size = 1; + for(unsigned int i = 0; i < src_dims.size(); ++i) + slice_size *= src_dims[i]; - /* slice size */ - int slice_size = 1; - for(unsigned int i = 0; i < src_dims.size(); ++i) - slice_size *= src_dims[i]; - - /* Gathering */ - if (place == CPUPlace()) { - // init for CPU - output = New_tensor.mutable_data(output_dims, CPUPlace()); - CPUGather(Src->data(), Index->data(), slice_size, new_tensor->mutable_data()); - } else { // GPU - // init for GPU - output = New_tensor.mutable_data(output_dims, GPUPlace()); - /* how to specialize device??*/ - GPUGather(d, Src->data(), Index->data(), slice_size, new_tensor->mutable_data()); - } - return New_tensor; + /* Gathering */ + if (place == CPUPlace()) { + // init for CPU + output = New_tensor.mutable_data(output_dims, CPUPlace()); + CPUGather(src->data(), index->data(), slice_size, new_tensor->mutable_data()); + } else { // GPU + // init for GPU + output = New_tensor.mutable_data(output_dims, GPUPlace()); + /* how to specialize device??*/ + GPUGather(d, src->data(), index->data(), slice_size, new_tensor->mutable_data()); + } + return New_tensor; } /* Implementation of CPU copy */ @@ -82,15 +79,15 @@ void CPUGather(const T* params, const int* indices, */ template void GPUGather(const GPUDevice& d, - const T* src, const int* Index, + const T* src, const int* index, const int slice_size, const int index_size, T* output) { - int block_count = slice_size * index_size; - int thread_per_block = 1024; + int block_count = slice_size * index_size; + int thread_per_block = 1024; - GatherOpKernel + GatherOpKernel <<>>( - src, Index, output, slice_size, + src, index, output, slice_size, indices_size, slice_size, out_size); } From eef55ca700a4f75e76996bbab04224470bb80f36 Mon Sep 17 00:00:00 2001 From: Zhuoyuan Date: Thu, 3 Aug 2017 01:02:40 -0700 Subject: [PATCH 20/51] remodify --- paddle/operators/gather_func.h | 76 ++++++++++-------- paddle/operators/scatter_func.h | 137 ++++++++++++++++---------------- 2 files changed, 108 insertions(+), 105 deletions(-) diff --git a/paddle/operators/gather_func.h b/paddle/operators/gather_func.h index e255bd7d15..5975675cbb 100644 --- a/paddle/operators/gather_func.h +++ b/paddle/operators/gather_func.h @@ -14,9 +14,9 @@ limitations under the License. 
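Both GPU kernels in this series "suppose" a CUDA_1D_KERNEL_LOOP helper that is not yet in the tree. For reference, a grid-stride loop of the shape the comments describe, plus a trivial kernel using it (a sketch under that assumption, not a definitive addition):

    // Each thread handles i, i + stride, i + 2*stride, ..., so any launch
    // geometry covers all n elements; extra threads simply do no iterations.
    #define CUDA_1D_KERNEL_LOOP(i, n)                              \
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
           i += blockDim.x * gridDim.x)

    template <typename T>
    __global__ void FillKernel(T* out, int n, T value) {
      CUDA_1D_KERNEL_LOOP(i, n) { out[i] = value; }
    }

With this pattern the launch configuration affects only speed, never coverage, so the block_count = slice_size * index_size used above (one block per output element, at 1024 threads each) is safe but heavily oversized; (n + 1023) / 1024 blocks of 1024 threads is the more usual choice.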
*/ #pragma once #include +#include "paddle/framework/ddim.h" #include "paddle/framework/tensor.h" #include "paddle/platform/place.h" -#include "paddle/framework/ddim.h" /** * Return a new tensor from source tensor, gathered according to index @@ -27,7 +27,7 @@ limitations under the License. */ template Tensor* Gather(Tensor* src, Tensor* index) { // check index of shape 1-D - PADDLE_ENFORCE(index->dims().size()==1); + PADDLE_ENFORCE(index->dims().size() == 1); int index_size = index->dims()[0]; // Source shape @@ -41,61 +41,67 @@ Tensor* Gather(Tensor* src, Tensor* index) { /* slice size */ int slice_size = 1; - for(unsigned int i = 0; i < src_dims.size(); ++i) - slice_size *= src_dims[i]; + for (size_t i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i]; /* Gathering */ if (place == CPUPlace()) { - // init for CPU - output = New_tensor.mutable_data(output_dims, CPUPlace()); - CPUGather(src->data(), index->data(), slice_size, new_tensor->mutable_data()); - } else { // GPU - // init for GPU - output = New_tensor.mutable_data(output_dims, GPUPlace()); - /* how to specialize device??*/ - GPUGather(d, src->data(), index->data(), slice_size, new_tensor->mutable_data()); + // init for CPU + output = New_tensor.mutable_data(output_dims, CPUPlace()); + CPUGather( + src->data(), index->data(), slice_size, new_tensor->mutable_data()); + } else { // GPU + // init for GPU + output = New_tensor.mutable_data(output_dims, GPUPlace()); + /* how to specialize device??*/ + GPUGather( + d, src->data(), index->data(), slice_size, new_tensor->mutable_data()); } return New_tensor; } /* Implementation of CPU copy */ -template -void CPUGather(const T* params, const int* indices, - const int slice_size, const int index_size, - T* output) { +template +void CPUGather(const T* params, + const int* indices, + const int slice_size, + const int index_size, + T* output) { const size_t slice_bytes = slice_size * sizeof(T); - for(int i = 0; i < index_size; ++i) - int index_ = indices[i]; - /* copy src[index_] to output[i] */ - memcpy(output + i * slice_bytes, - params + index_ * slice_bytes, - slice_bytes); + for (size_t i = 0; i < index_size; ++i) { + int index_ = indices[i]; + /* copy src[index_] to output[i] */ + memcpy( + output + i * slice_bytes, params + index_ * slice_bytes, slice_bytes); + } } /* Implementation of GPU copy: I suppose the GPUDevice& d, contains gpu_id and thread_id d = cuda_stream(gpu_id_, stream_id_); */ -template +template void GPUGather(const GPUDevice& d, - const T* src, const int* index, - const int slice_size, const int index_size, - T* output) { + const T* src, + const int* index, + const int slice_size, + const int index_size, + T* output) { int block_count = slice_size * index_size; int thread_per_block = 1024; - GatherOpKernel - <<>>( - src, index, output, slice_size, - indices_size, slice_size, out_size); + GatherOpKernel<<>>( + src, index, output, slice_size, indices_size, slice_size, out_size); } template -__global__ void GatherOpKernel(const T* params, const int* indices, T* out, +__global__ void GatherOpKernel(const T* params, + const int* indices, + T* out, int64 indices_size, - int64 slice_size, int64 out_size) { - /* I suppose we have the following macro, + int64 slice_size, + int64 out_size) { + /* I suppose we have the following macro, which I strongly suggest that we should put in cuda: #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ @@ -103,9 +109,9 @@ __global__ void GatherOpKernel(const T* params, const int* indices, T* 
out, */ CUDA_1D_KERNEL_LOOP(i, out_size) { int indices_i = i / slice_size; - int slice_i = i - indices_i * slice_size; // offset inside the slice + int slice_i = i - indices_i * slice_size; // offset inside the slice int gather_i = indices[indices_i]; int params_i = gather_i * slice_size + slice_i; out[i] = *(params + params_i); - } + } } diff --git a/paddle/operators/scatter_func.h b/paddle/operators/scatter_func.h index 6ee3fdf3a3..53b260170f 100644 --- a/paddle/operators/scatter_func.h +++ b/paddle/operators/scatter_func.h @@ -14,96 +14,93 @@ limitations under the License. */ #pragma once #include +#include "paddle/framework/ddim.h" #include "paddle/framework/tensor.h" #include "paddle/platform/place.h" -#include "paddle/framework/ddim.h" /** * Return a updated tensor from source tensor, scattered according to index: * dst[i] += src[index[i]] * input[src]: type-T source Tensor - * input[Index]: type-int index Tensor (1-D) + * input[index]: type-int index Tensor (1-D) * return: output tensor */ -template -void ScatterUpdate_func(Tensor* Src, Tensor* Dst, Tensor* Index) { - // assert index is an int-type tensor - assert(Index->istype(int)); - - // Source shape - auto src_dims = Src->dims(); - auto dst_dims = Dst->dims(); - DDim output_dims(dims_src); - - // check Src shape and Dst shape should match - for(int i = 1; i < src_dims.size(); i++) - assert(src_dims[i]==dst_dims[i]); - - int index_size = Index->dims()[0]; - - /* slice size */ - int slice_size = 1; - for(unsigned int i = 0; i < src_dims.size(); ++i) - slice_size *= src_dims[i]; - - if (place == CPUPlace()) { - // init - output = new_tensor.mutable_data(output_dims, CPUPlace()); - CPUScatterUpdate(src->data(), index->data(), slice_size, new_tensor->mutable_data()); - - } else { // GPU - // init - output = new_tensor.mutable_data(output_dims, GPUPlace()); - /* how to specialize device??*/ - GPUScatterUpdate(d, src->data(), index->data(), slice_size, new_tensor->mutable_data()); - } +template +void ScatterUpdate(Tensor* src, Tensor* dst, Tensor* index) { + // Source shape + auto src_dims = src->dims(); + auto dst_dims = dst->dims(); + DDim output_dims(dims_src); + + // check src shape and dst shape should match + for (size_t i = 1; i < src_dims.size(); i++) + PADDLE_ENFORCE(src_dims[i] == dst_dims[i]); + + int index_size = index->dims()[0]; + + /* slice size */ + int slice_size = 1; + for (size_t i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + if (place == CPUPlace()) { + // init + output = new_tensor.mutable_data(output_dims, CPUPlace()); + CPUScatterUpdate( + src->data(), index->data(), slice_size, new_tensor->mutable_data()); + + } else { // GPU + // init + output = new_tensor.mutable_data(output_dims, GPUPlace()); + /* how to specialize device??*/ + GPUScatterUpdate( + d, src->data(), index->data(), slice_size, new_tensor->mutable_data()); + } } /* Implementation of CPU copy */ -template -void CPUScatterUpdate(const T* src, const int* Index, - const int slice_size, const int index_size, - T* output) { - //const size_t slice_bytes = slice_size * sizeof(T); - - for(int i = 0; i < index_size; ++i) - int index_ = index[i]; - /* dst[index_] += src[index_] - add operation size: slice_size - */ - math::vAdd(slice_size, src + index_ * slice_bytes, - output + i * slice_bytes, - output + i * slice_bytes); - /* Scatter update, not just assign - memcpy(output + i * slice_bytes, - src + index_ * slice_bytes, - slice_bytes); - */ +template +void CPUScatterUpdate(const T* src, + const int* index, + const int slice_size, + const 
int index_size, + T* output) { + // const size_t slice_bytes = slice_size * sizeof(T); + + for (size_t i = 0; i < index_size; ++i) { + int index_ = index[i]; + math::vAdd(slice_size, + src + index_ * slice_bytes, + output + i * slice_bytes, + output + i * slice_bytes); + } } /* Implementation of GPU scatter: I suppose the GPUDevice& d, contains gpu_id and thread_id d = cuda_stream(gpu_id_, stream_id_); */ -template +template void GPUScatterUpdate(const GPUDevice& d, - const T* src, const int* Index, - const int slice_size, const int index_size, - T* output) { - int block_count = slice_size * index_size; - int thread_per_block = 1024; - - ScatterOpKernel - <<>>( - src, Index, output, slice_size, - indices_size, slice_size, out_size); + const T* src, + const int* index, + const int slice_size, + const int index_size, + T* output) { + int block_count = slice_size * index_size; + int thread_per_block = 1024; + + ScatterOpKernel<<>>( + src, index, output, slice_size, indices_size, slice_size, out_size); } template -__global__ void ScatterOpKernel(const T* params, const int* indices, T* out, - int64 indices_size, - int64 slice_size, int64 out_size) { - /* I suppose we have the following macro, +__global__ void ScatterOpKernel(const T* params, + const int* indices, + T* out, + int64 indices_size, + int64 slice_size, + int64 out_size) { + /* I suppose we have the following macro, which I strongly suggest that we should put in cuda: #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ @@ -111,9 +108,9 @@ __global__ void ScatterOpKernel(const T* params, const int* indices, T* out, */ CUDA_1D_KERNEL_LOOP(i, out_size) { int indices_i = i / slice_size; - int slice_i = i - indices_i * slice_size; // offset inside the slice + int slice_i = i - indices_i * slice_size; // offset inside the slice int scatter_i = indices[indices_i]; int params_i = scatter_i * slice_size + slice_i; out[i] += *(params + params_i); - } + } } From 6b3e0b786d9de3ef912953859e23204066aa70a4 Mon Sep 17 00:00:00 2001 From: zchen0211 Date: Sat, 5 Aug 2017 15:05:51 -0700 Subject: [PATCH 21/51] gather function with test passed --- paddle/operators/CMakeLists.txt | 5 ++ paddle/operators/gather_func.cc | 19 +++++ paddle/operators/gather_func.h | 124 ++++++++++++++------------------ paddle/operators/gather_test.cc | 50 +++++++++++++ 4 files changed, 126 insertions(+), 72 deletions(-) create mode 100644 paddle/operators/gather_func.cc create mode 100644 paddle/operators/gather_test.cc diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index b910bee836..10922892ca 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -41,6 +41,11 @@ function(op_library TARGET) endif() endfunction() +op_library(gather SRCS gather_func.cc) +cc_test(gather_test SRCS gather_test.cc DEPS gather) + +op_library(scatter SRCS scatter_func.cc) + op_library(add_op SRCS add_op.cc add_op.cu) cc_test(add_op_test SRCS add_op_test.cc DEPS add_op) diff --git a/paddle/operators/gather_func.cc b/paddle/operators/gather_func.cc new file mode 100644 index 0000000000..a6b2331f32 --- /dev/null +++ b/paddle/operators/gather_func.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/gather_func.h" +#include +#include "paddle/framework/ddim.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/place.h" diff --git a/paddle/operators/gather_func.h b/paddle/operators/gather_func.h index 5975675cbb..5adc1e6b17 100644 --- a/paddle/operators/gather_func.h +++ b/paddle/operators/gather_func.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -13,51 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include + #include "paddle/framework/ddim.h" #include "paddle/framework/tensor.h" #include "paddle/platform/place.h" -/** - * Return a new tensor from source tensor, gathered according to index - * input[src]: type-T source Tensor - * input[index]: type-int index Tensor (1-D) - * return: output tensor - */ -template -Tensor* Gather(Tensor* src, Tensor* index) { - // check index of shape 1-D - PADDLE_ENFORCE(index->dims().size() == 1); - int index_size = index->dims()[0]; - - // Source shape - auto src_dims = src->dims(); - DDim output_dims(dims_src); - // Create a tensor of shape [index_size, dim_src[1:]] - output_dims[0] = index_size; - - Tensor* New_tensor; - float* output = nullptr; - - /* slice size */ - int slice_size = 1; - for (size_t i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i]; +using paddle::framework::Tensor; +using paddle::framework::DDim; - /* Gathering */ - if (place == CPUPlace()) { - // init for CPU - output = New_tensor.mutable_data(output_dims, CPUPlace()); - CPUGather( - src->data(), index->data(), slice_size, new_tensor->mutable_data()); - } else { // GPU - // init for GPU - output = New_tensor.mutable_data(output_dims, GPUPlace()); - /* how to specialize device??*/ - GPUGather( - d, src->data(), index->data(), slice_size, new_tensor->mutable_data()); - } - return New_tensor; -} +namespace paddle { +namespace operators { /* Implementation of CPU copy */ template @@ -70,48 +37,61 @@ void CPUGather(const T* params, for (size_t i = 0; i < index_size; ++i) { int index_ = indices[i]; - /* copy src[index_] to output[i] */ - memcpy( - output + i * slice_bytes, params + index_ * slice_bytes, slice_bytes); + // copy src[index_] to output[i] + memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes); } } /* Implementation of GPU copy: - I suppose the GPUDevice& d, contains gpu_id and thread_id - d = cuda_stream(gpu_id_, stream_id_); + I suppose the GPUDevice& d, contains gpu_id and thread_id + d = cuda_stream(gpu_id_, stream_id_); */ template -void GPUGather(const GPUDevice& d, - const T* src, +void GPUGather(const T* src, const int* index, const int slice_size, const int index_size, - T* output) { - int block_count = slice_size * index_size; - int 
thread_per_block = 1024; - - GatherOpKernel<<>>( - src, index, output, slice_size, indices_size, slice_size, out_size); -} + T* output); +/** + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ template -__global__ void GatherOpKernel(const T* params, - const int* indices, - T* out, - int64 indices_size, - int64 slice_size, - int64 out_size) { - /* I suppose we have the following macro, - which I strongly suggest that we should put in cuda: - #define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ - i += blockDim.x * gridDim.x) - */ - CUDA_1D_KERNEL_LOOP(i, out_size) { - int indices_i = i / slice_size; - int slice_i = i - indices_i * slice_size; // offset inside the slice - int gather_i = indices[indices_i]; - int params_i = gather_i * slice_size + slice_i; - out[i] = *(params + params_i); +void Gather(const platform::Place& place, + const paddle::framework::Tensor* src, + const paddle::framework::Tensor* index, + paddle::framework::Tensor* output) { + // check index of shape 1-D + PADDLE_ENFORCE(index->dims().size() == 1); + int index_size = index->dims()[0]; + + auto src_dims = src->dims(); + DDim output_dims(src_dims); + output_dims[0] = index_size; + + // slice size + int slice_size = 1; + for (size_t i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + // Gathering + if (platform::is_cpu_place(place)) { + CPUGather(src->data(), + index->data(), + slice_size, + index_size, + output->data()); + } else { + // init for GPU + // output_arr = output->mutable_data(output_dims, platform::GPUPlace()); + // how to specialize device?? + // GPUGather( + // d, src->data(), index->data(), slice_size, + // new_tensor->mutable_data()); } } + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc new file mode 100644 index 0000000000..6f220b133b --- /dev/null +++ b/paddle/operators/gather_test.cc @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/ddim.h" +#include "paddle/framework/tensor.h" +#include "paddle/operators/gather_func.h" +#include "paddle/platform/place.h" + +#include +#include +#include + +TEST(_abc_, GatherData) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators; + + Tensor* src = new Tensor(); + Tensor* index = new Tensor(); + Tensor* output = new Tensor(); + // src.Resize(make_ddim({3, 4})); + + int* p_src = nullptr; + int* p_index = nullptr; + p_src = src->mutable_data(make_ddim({3, 4}), CPUPlace()); + p_index = index->mutable_data(make_ddim({2}), CPUPlace()); + + for (size_t i = 0; i < 12; ++i) p_src[i] = i; + p_index[0] = 1; + p_index[1] = 0; + + // gather + int* p_output = output->mutable_data(make_ddim({2, 4}), CPUPlace()); + + Gather(CPUPlace(), src, index, output); + + for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); + for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4); +} From 0f8c9dbe61762092a701ac035445dbae31b27338 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Sun, 6 Aug 2017 15:37:36 +0800 Subject: [PATCH 22/51] device context pointer --- paddle/operators/CMakeLists.txt | 2 +- paddle/operators/gaussian_random_op.cc | 11 +++++------ paddle/operators/gaussian_random_op.cu | 26 +++++++++++++------------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 8887dc6dbd..3b60df0218 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -55,7 +55,7 @@ op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc) op_library(sigmoid_op SRCS sigmoid_op.cc sigmoid_op.cu) op_library(softmax_op SRCS softmax_op.cc softmax_op.cu) -op_library(guassian_random_op SRCS guassain_random_op.cc guassian_random_op.cu) +op_library(gaussian_random_op SRCS gaussian_random_op.cc gaussian_random_op.cu) op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu) op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu) diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index 7afc0cd56b..f5fd902c5f 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -12,9 +12,9 @@ See the License for the specific language governing permissions and limitations under the License. 
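The CPU kernel that follows reduces to seeding a standard-library engine and drawing from std::normal_distribution. The same fill, distilled to a free function over a raw array (illustration only; the real kernel reads mean, std, and seed from operator attributes and writes into the output tensor):

    #include <random>

    // Fills data[0..size) with draws from N(mean, stddev). A zero seed falls
    // back to a nondeterministic std::random_device seed, as the kernel does.
    void GaussianFill(float* data, int size, float mean, float stddev,
                      unsigned int seed) {
      if (seed == 0) seed = std::random_device()();
      std::mt19937 g(seed);
      std::normal_distribution<float> dist(mean, stddev);
      for (int i = 0; i < size; ++i) data[i] = dist(g);
    }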
*/ +#include "paddle/operators/gaussian_random_op.h" #include "glog/logging.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/random_op.h" namespace paddle { namespace operators { @@ -22,7 +22,7 @@ namespace operators { template class GaussianRandomOpKernel : public framework::OpKernel { -public: + public: void Compute(const framework::KernelContext& context) const override { auto mean = context.op_.GetAttr("mean"); auto std = context.op_.GetAttr("std"); @@ -40,7 +40,7 @@ public: }; class GaussianRandomOp : public framework::OperatorWithKernel { -protected: + protected: void InferShape( const std::vector& inputs, const std::vector& outputs) const override { @@ -54,7 +54,7 @@ protected: }; class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { -public: + public: GaussianRandomOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { @@ -74,8 +74,7 @@ The eqution : Out = GaussianRandom(Shape=(d0, d1, ...), Dtype, mean, std) } // namespace operators } // namespace paddle -REGISTER_OP(gaussian_random, - paddle::operators::GaussianRandomOp, +REGISTER_OP(gaussian_random, paddle::operators::GaussianRandomOp, paddle::operators::GaussianRandomOpMaker); typedef paddle::operators::GaussianRandomOpKernel -class GaussianRandomOpKernel : public framework::OpKernel { -public: + +template +class GaussianRandomOpKernel + : public framework::OpKernel { + public: void Compute(const framework::KernelContext& context) const override { auto mean = context.op_.GetAttr("mean"); auto std = context.op_.GetAttr("std"); auto* output = context.Output(0)->GetMutable(); T* r = output->mutable_data(context.GetPlace()); - auto ctx = static_cast - (context.device_context_); - // generator need to modify context + auto ctx = + static_cast(context.device_context_); + // generator need to modify context auto g = const_cast(ctx)->RandGenerator(); curandGenerateNormal(g, r, framework::product(output->dims()), mean, std); - } }; - + } // namespace operators } // namespace paddle - -typedef paddle::operators::GaussianRandomOpKernel - RandomOpKernel_GPU_float; +typedef paddle::operators::GaussianRandomOpKernel + RandomOpKernel_GPU_float; REGISTER_OP_GPU_KERNEL(gaussian_random, GaussianRandomOpKernel_GPU_float); \ No newline at end of file From fcd6f64b98aafdb13d29395eaa3573f69632382a Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Mon, 7 Aug 2017 17:28:38 +0800 Subject: [PATCH 23/51] "redefine random op" --- paddle/operators/gaussian_random_op.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index f5fd902c5f..d7ced6b526 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -23,7 +23,7 @@ template class GaussianRandomOpKernel : public framework::OpKernel { public: - void Compute(const framework::KernelContext& context) const override { + void Compute(const framework::ExecutionContext& context) const override { auto mean = context.op_.GetAttr("mean"); auto std = context.op_.GetAttr("std"); auto* output = context.Output(0)->GetMutable(); @@ -41,15 +41,14 @@ class GaussianRandomOpKernel class GaussianRandomOp : public framework::OperatorWithKernel { protected: - void InferShape( - const std::vector& inputs, - const std::vector& outputs) const override { + void InferShape(const framework::InferShapeContext& ctx) const override { PADDLE_ENFORCE(inputs.size() == 0, 
"Input size of RandomOp must be zero."); PADDLE_ENFORCE(outputs.size() == 1, "Output size of RandomOp must be one."); PADDLE_ENFORCE(outputs[0] != nullptr, "Outputs of RandomOp must all be set."); - outputs[0]->Resize( - framework::make_ddim(this->GetAttr>("shape"))); + auto* tensor = ctx.Output(0); + auto dims = GetAttr(std::vector("shape")); + tensor->Resize(framework::make_ddim(dims)); } }; From 7e3747131899685b7b058241576a1e2e96f172ea Mon Sep 17 00:00:00 2001 From: zchen0211 Date: Mon, 7 Aug 2017 14:26:47 -0700 Subject: [PATCH 24/51] modify --- paddle/operators/CMakeLists.txt | 5 +- paddle/operators/{gather_func.h => gather.h} | 0 paddle/operators/gather_func.cc | 19 --- paddle/operators/gather_test.cc | 4 +- paddle/operators/scatter_func.h | 116 ------------------- 5 files changed, 3 insertions(+), 141 deletions(-) rename paddle/operators/{gather_func.h => gather.h} (100%) delete mode 100644 paddle/operators/gather_func.cc delete mode 100644 paddle/operators/scatter_func.h diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 10922892ca..a2284fc8f0 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -41,10 +41,7 @@ function(op_library TARGET) endif() endfunction() -op_library(gather SRCS gather_func.cc) -cc_test(gather_test SRCS gather_test.cc DEPS gather) - -op_library(scatter SRCS scatter_func.cc) +cc_test(gather_test SRCS gather_test.cc DEPS tensor) op_library(add_op SRCS add_op.cc add_op.cu) cc_test(add_op_test SRCS add_op_test.cc DEPS add_op) diff --git a/paddle/operators/gather_func.h b/paddle/operators/gather.h similarity index 100% rename from paddle/operators/gather_func.h rename to paddle/operators/gather.h diff --git a/paddle/operators/gather_func.cc b/paddle/operators/gather_func.cc deleted file mode 100644 index a6b2331f32..0000000000 --- a/paddle/operators/gather_func.cc +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/operators/gather_func.h" -#include -#include "paddle/framework/ddim.h" -#include "paddle/framework/tensor.h" -#include "paddle/platform/place.h" diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc index 6f220b133b..5d84b7b5f3 100644 --- a/paddle/operators/gather_test.cc +++ b/paddle/operators/gather_test.cc @@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/operators/gather.h" #include "paddle/framework/ddim.h" #include "paddle/framework/tensor.h" -#include "paddle/operators/gather_func.h" #include "paddle/platform/place.h" #include #include #include -TEST(_abc_, GatherData) { +TEST(Gather, GatherData) { using namespace paddle::framework; using namespace paddle::platform; using namespace paddle::operators; diff --git a/paddle/operators/scatter_func.h b/paddle/operators/scatter_func.h deleted file mode 100644 index 53b260170f..0000000000 --- a/paddle/operators/scatter_func.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/framework/ddim.h" -#include "paddle/framework/tensor.h" -#include "paddle/platform/place.h" - -/** - * Return a updated tensor from source tensor, scattered according to index: - * dst[i] += src[index[i]] - * input[src]: type-T source Tensor - * input[index]: type-int index Tensor (1-D) - * return: output tensor - */ -template -void ScatterUpdate(Tensor* src, Tensor* dst, Tensor* index) { - // Source shape - auto src_dims = src->dims(); - auto dst_dims = dst->dims(); - DDim output_dims(dims_src); - - // check src shape and dst shape should match - for (size_t i = 1; i < src_dims.size(); i++) - PADDLE_ENFORCE(src_dims[i] == dst_dims[i]); - - int index_size = index->dims()[0]; - - /* slice size */ - int slice_size = 1; - for (size_t i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i]; - - if (place == CPUPlace()) { - // init - output = new_tensor.mutable_data(output_dims, CPUPlace()); - CPUScatterUpdate( - src->data(), index->data(), slice_size, new_tensor->mutable_data()); - - } else { // GPU - // init - output = new_tensor.mutable_data(output_dims, GPUPlace()); - /* how to specialize device??*/ - GPUScatterUpdate( - d, src->data(), index->data(), slice_size, new_tensor->mutable_data()); - } -} - -/* Implementation of CPU copy */ -template -void CPUScatterUpdate(const T* src, - const int* index, - const int slice_size, - const int index_size, - T* output) { - // const size_t slice_bytes = slice_size * sizeof(T); - - for (size_t i = 0; i < index_size; ++i) { - int index_ = index[i]; - math::vAdd(slice_size, - src + index_ * slice_bytes, - output + i * slice_bytes, - output + i * slice_bytes); - } -} - -/* Implementation of GPU scatter: - I suppose the GPUDevice& d, contains gpu_id and thread_id - d = cuda_stream(gpu_id_, stream_id_); -*/ -template -void GPUScatterUpdate(const GPUDevice& d, - const T* src, - const int* index, - const int slice_size, - const int index_size, - T* output) { - int block_count = slice_size * index_size; - int thread_per_block = 1024; - - ScatterOpKernel<<>>( - src, index, output, slice_size, indices_size, slice_size, out_size); -} - -template -__global__ void ScatterOpKernel(const T* params, - const int* indices, - T* out, - int64 indices_size, - int64 slice_size, - int64 out_size) { - /* I suppose we have the following macro, - which I strongly suggest 
that we should put in cuda: - #define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ - i += blockDim.x * gridDim.x) - */ - CUDA_1D_KERNEL_LOOP(i, out_size) { - int indices_i = i / slice_size; - int slice_i = i - indices_i * slice_size; // offset inside the slice - int scatter_i = indices[indices_i]; - int params_i = scatter_i * slice_size + slice_i; - out[i] += *(params + params_i); - } -} From e2c08d286fe8f230cb20ab5b6de6601d93feb82c Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 8 Aug 2017 16:21:21 +0800 Subject: [PATCH 25/51] "keep style same with uniform operators" --- paddle/operators/gaussian_random_op.cc | 69 +++++++++++++------------- paddle/operators/gaussian_random_op.cu | 43 +++++++++------- paddle/operators/gaussian_random_op.h | 17 ------- 3 files changed, 60 insertions(+), 69 deletions(-) delete mode 100644 paddle/operators/gaussian_random_op.h diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index d7ced6b526..b0b68ff36d 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -12,42 +12,42 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/gaussian_random_op.h" -#include "glog/logging.h" +#include #include "paddle/framework/op_registry.h" namespace paddle { namespace operators { template -class GaussianRandomOpKernel - : public framework::OpKernel { +class GaussianRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto mean = context.op_.GetAttr("mean"); - auto std = context.op_.GetAttr("std"); - auto* output = context.Output(0)->GetMutable(); - T* r = output->mutable_data(context.GetPlace()); - auto ctx = - static_cast(context.device_context_); - // generator need to modify context - auto g = const_cast(ctx)->RandGenerator(); + T mean = static_cast(context.op_.GetAttr("mean")); + T std = static_cast(context.op_.GetAttr("std")); + auto* tensor = context.Output(0); + T* data = tensor->mutable_data(context.GetPlace()); + + // TODO(dzh): attribute does not support unsigned int. + // And we need a global random seed configuration. + int seed = context.op_.GetAttr("seed"); + if (seed == 0) { + seed = std::random_device()(); + } + std::mt19937 g(seed); std::normal_distribution distribution(mean, std); - for (int i = 0; i < framework::product(output->dims()); ++i) { - r[i] = distribution(g); + for (int i = 0; i < framework::product(tensor->dims()); ++i) { + data[i] = distribution(g); } } }; class GaussianRandomOp : public framework::OperatorWithKernel { protected: - void InferShape(const framework::InferShapeContext& ctx) const override { - PADDLE_ENFORCE(inputs.size() == 0, "Input size of RandomOp must be zero."); - PADDLE_ENFORCE(outputs.size() == 1, "Output size of RandomOp must be one."); - PADDLE_ENFORCE(outputs[0] != nullptr, - "Outputs of RandomOp must all be set."); - auto* tensor = ctx.Output(0); - auto dims = GetAttr(std::vector("shape")); + void InferShape(const framework::InferShapeContext& context) const override { + auto* tensor = context.Output(0); + auto dims = GetAttr>("dims"); + PADDLE_ENFORCE(dims.size() > 0UL, + "dims can be one int or array. 
dims must be set."); tensor->Resize(framework::make_ddim(dims)); } }; @@ -57,26 +57,25 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { GaussianRandomOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddAttr>("shape", "The shape of matrix to be randomized"); - AddAttr("mean", "mean value of random.").SetDefault(.0); - AddAttr("std", "minimum value of random value") - .SetDefault(1.0) - .LargerThan(.0); AddOutput("Out", "output matrix of random op"); AddComment(R"DOC( -GaussianRandom Operator fill a matrix in normal distribution. -The eqution : Out = GaussianRandom(Shape=(d0, d1, ...), Dtype, mean, std) +GaussianRandom operator. +Use to initialize tensor with gaussian random generator. )DOC"); + + AddAttr>("dims", "The dimension of random tensor."); + AddAttr("mean", "mean value of random.").SetDefault(.0f); + AddAttr("std", "minimum value of random value.").SetDefault(1.0f); + AddAttr("seed", + "Random seed of generator." + "0 means use system wide seed") + .SetDefault(0); } }; } // namespace operators } // namespace paddle -REGISTER_OP(gaussian_random, paddle::operators::GaussianRandomOp, - paddle::operators::GaussianRandomOpMaker); - -typedef paddle::operators::GaussianRandomOpKernel - GaussianRandomOpKernel_CPU_float; -REGISTER_OP_CPU_KERNEL(gaussian_random, GaussianRandomOpKernel_CPU_float); +namespace ops = paddle::operators; +REGISTER_OP(gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker); +REGISTER_OP_CPU_KERNEL(gaussian_random, ops::GaussianRandomKernel); diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu index afede378f6..a408d2aa79 100644 --- a/paddle/operators/gaussian_random_op.cu +++ b/paddle/operators/gaussian_random_op.cu @@ -1,30 +1,39 @@ +#include +#include +#include "paddle/platform/dynload/curand.h" +#include "paddle/platform/gpu_info.h" + #include "paddle/framework/op_registry.h" -#include "paddle/operators/guassian_random_op.h" namespace paddle { namespace operators { template -class GaussianRandomOpKernel - : public framework::OpKernel { +class GaussianRandomKernel : public framework::OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto mean = context.op_.GetAttr("mean"); - auto std = context.op_.GetAttr("std"); - auto* output = context.Output(0)->GetMutable(); - T* r = output->mutable_data(context.GetPlace()); - auto ctx = - static_cast(context.device_context_); - // generator need to modify context - auto g = const_cast(ctx)->RandGenerator(); - curandGenerateNormal(g, r, framework::product(output->dims()), mean, std); + void Compute(const framework::ExecutionContext& context) const override { + T mean = static_cast(context.op_.GetAttr("mean")); + T std = static_cast(context.op_.GetAttr("std")); + auto* tensor = context.Output(0); + T* data = tensor->mutable_data(context.GetPlace()); + + int seed = context.op_.GetAttr("seed"); + if (seed == 0) { + seed = std::random_device()(); + } + curandGenerator_t g; + PADDLE_ENFORCE(platform::dynload::curandCreateGenerator( + &g, CURAND_RNG_PSEUDO_DEFAULT)); + PADDLE_ENFORCE( + platform::dynload::curandSetPseudoRandomGeneratorSeed(g, seed)); + // auto g = const_cast(ctx)->RandGenerator(); + curandGenerateNormal(g, data, framework::product(tensor->dims()), mean, + std); } }; } // namespace operators } // namespace paddle -typedef paddle::operators::GaussianRandomOpKernel - RandomOpKernel_GPU_float; 
-REGISTER_OP_GPU_KERNEL(gaussian_random, GaussianRandomOpKernel_GPU_float); \ No newline at end of file +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(gaussian_random, ops::GaussianRandomKernel); \ No newline at end of file diff --git a/paddle/operators/gaussian_random_op.h b/paddle/operators/gaussian_random_op.h deleted file mode 100644 index b463a171d9..0000000000 --- a/paddle/operators/gaussian_random_op.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once -#include -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" - -namespace paddle { -namespace operators { - -template -class GaussianRandomOpKernel : public framework::OpKernel { -public: - void Compute(const framework::KernelContext& context) const override {} -}; - -} // namespace operators -} // namespace paddle From 52d2ebdaef66f980c8ecb4878d41da6b44467115 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 8 Aug 2017 16:40:21 +0800 Subject: [PATCH 26/51] "test gaussian random in python side" --- paddle/framework/CMakeLists.txt | 1 + paddle/framework/pybind.cc | 1 + .../paddle/v2/framework/tests/CMakeLists.txt | 2 ++ .../tests/test_gaussian_random_op.py | 33 +++++++++++++++++++ .../v2/framework/tests/test_random_op.py | 29 ---------------- 5 files changed, 37 insertions(+), 29 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_gaussian_random_op.py delete mode 100644 python/paddle/v2/framework/tests/test_random_op.py diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 1db042c6fc..9b96a59189 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -43,4 +43,5 @@ cc_library(paddle_pybind SHARED add_op mean_op cross_entropy_op + gaussian_random_op recurrent_op) diff --git a/paddle/framework/pybind.cc b/paddle/framework/pybind.cc index cbb86c4195..85548e3e91 100644 --- a/paddle/framework/pybind.cc +++ b/paddle/framework/pybind.cc @@ -41,6 +41,7 @@ USE_OP(sigmoid); USE_OP(softmax); USE_OP(rowwise_add); USE_OP_WITHOUT_KERNEL(recurrent_op); +USE_OP(gaussian_random); namespace paddle { namespace framework { template diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 7eec376788..5a89984118 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -21,3 +21,5 @@ py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py) py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py) py_test(test_op_creation_methods SRCS test_op_creation_methods.py) + +py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py) diff --git a/python/paddle/v2/framework/tests/test_gaussian_random_op.py b/python/paddle/v2/framework/tests/test_gaussian_random_op.py new file mode 100644 index 0000000000..020e69fe14 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py @@ -0,0 +1,33 @@ +import unittest +import paddle.v2.framework.core as core +import paddle.v2.framework.op as Operator +import numpy + + +class GaussianRandomTest(unittest.TestCase): + def test_cpu(self): + self.test_gaussian_random(place=core.CPUPlace()) + + def test_gpu(self): + self.test_gaussian_random(place=core.GPUPlace(0)) + + def test_gaussian_random(self, place): + scope = core.Scope() + scope.new_var("Out").get_tensor() + op = Operator( + "gaussian_random", + Out="Out", + dims=[1000, 784], + mean=.0, + std=1., + seed=0) + op.infer_shape(scope) + context = core.DeviceContext.create(place) + 
op.run(scope, context) + tensor = numpy.array(scope.find_var("Out").get_tensor()) + self.assertAlmostEqual(numpy.mean(tensor), .0, places=3) + self.assertAlmostEqual(numpy.std(tensor), 1., places=3) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_random_op.py b/python/paddle/v2/framework/tests/test_random_op.py deleted file mode 100644 index d3474880d3..0000000000 --- a/python/paddle/v2/framework/tests/test_random_op.py +++ /dev/null @@ -1,29 +0,0 @@ -import unittest -import paddle.v2.framework.create_op_creation_methods as creation -import paddle.v2.framework.core as core -from op_test_util import OpTestMeta -import numpy - - -class TestRandomOp(unittest.TestCase): - def test_random(self): - scope = core.Scope(None) - # Out = scope.create_var("Out") - op = creation.op_creations.gaussian_random( - shape=[1000, 1000], mean=5.0, std=1.0, Out="Out") - for out in op.outputs(): - if scope.get_var(out) is None: - scope.create_var(out).get_tensor() - - tensor = scope.get_var("Out").get_tensor() - op.infer_shape(scope) - self.assertEqual([1000, 1000], tensor.shape()) - ctx = core.DeviceContext.cpu_context() - op.run(scope, ctx) - tensor_array = numpy.array(tensor) - self.assertAlmostEqual(numpy.mean(tensor_array), 5.0, places=3) - self.assertAlmostEqual(numpy.std(tensor_array), 1.0, places=3) - - -if __name__ == '__main__': - unittest.main() From 555af4d0f6a02c88f9503cb6ffdd5c5e66e2e9b4 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 8 Aug 2017 17:21:04 +0800 Subject: [PATCH 27/51] "format code" --- paddle/framework/CMakeLists.txt | 6 +++--- paddle/operators/gaussian_random_op.cu | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 55d4bc428e..1ebab6b8ab 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -38,9 +38,9 @@ cc_test(backward_test SRCS backward_test.cc DEPS backward) if(WITH_PYTHON) cc_library(paddle_pybind SHARED - SRCS pybind.cc - DEPS pybind python backward - fc_op + SRCS pybind.cc + DEPS pybind python backward + fc_op sgd_op add_op mean_op diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu index a408d2aa79..164753f946 100644 --- a/paddle/operators/gaussian_random_op.cu +++ b/paddle/operators/gaussian_random_op.cu @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
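A tolerance note on test_gaussian_random_op.py above: with dims=[1000, 784] the op draws n = 784,000 samples, so under a normal approximation the standard error of the sample mean is 1/sqrt(784000) ≈ 1.1e-3, while assertAlmostEqual(..., places=3) only tolerates differences below 5e-4 — about 0.44 standard errors. Since seed=0 selects a nondeterministic seed, the mean check can be expected to fail a substantial fraction of runs; a fixed nonzero seed (or a looser places/delta) would make it deterministic.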
*/ + #include #include #include "paddle/platform/dynload/curand.h" From ba3f3f56893c7d7ad6d8cf8de019ca82cbf53ead Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 8 Aug 2017 18:50:08 +0800 Subject: [PATCH 28/51] Ignore compile generated files for git --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c84b2fc8c7..9622ab78e0 100644 --- a/.gitignore +++ b/.gitignore @@ -24,4 +24,5 @@ cmake-build-* python/paddle/v2/framework/core.so CMakeFiles cmake_install.cmake - +paddle/.timestamp +python/paddlepaddle.egg-info/ From d98e299d3b9977819afbf9db53a97c5c0bbbaa68 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 9 Aug 2017 00:16:55 +0800 Subject: [PATCH 29/51] "keep same with uniform random op" --- python/paddle/v2/framework/tests/test_gaussian_random_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/framework/tests/test_gaussian_random_op.py b/python/paddle/v2/framework/tests/test_gaussian_random_op.py index 020e69fe14..0ff8c89a14 100644 --- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py +++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py @@ -1,6 +1,6 @@ import unittest import paddle.v2.framework.core as core -import paddle.v2.framework.op as Operator +from paddle.v2.framework.op import Operator import numpy From 70825506d1561d53c1efdfc5a50ef8cb8a4c4f9f Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 9 Aug 2017 00:28:04 +0800 Subject: [PATCH 30/51] "remove context random seeding " --- paddle/platform/device_context.cc | 7 ++----- paddle/platform/device_context.h | 14 ++------------ 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index f80c36b5b2..a928e09778 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -21,12 +21,10 @@ Eigen::DefaultDevice* DeviceContext::get_eigen_device() } CPUDeviceContext::CPUDeviceContext() { - random_seed_ = std::chrono::system_clock::now().time_since_epoch().count(); eigen_device_.reset(new Eigen::DefaultDevice()); } CPUDeviceContext::CPUDeviceContext(CPUPlace place) { - random_seed_ = std::chrono::system_clock::now().time_since_epoch().count(); eigen_device_.reset(new Eigen::DefaultDevice()); } @@ -44,7 +42,6 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device() const { } CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { - random_seed_ = std::chrono::system_clock::now().time_since_epoch().count(); SetDeviceId(place_.device); // TODO(qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly // here will cause segment fault. We must implement a class derived from @@ -111,8 +108,8 @@ curandGenerator_t CUDADeviceContext::curand_generator() { SetDeviceId(place_.device); PADDLE_ENFORCE(dynload::curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - PADDLE_ENFORCE(dynload::curandSetPseudoRandomGeneratorSeed( - curand_generator_, random_seed_)); + PADDLE_ENFORCE( + dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_)); } return curand_generator_; } diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 450213c34a..08b5b2cff9 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -21,7 +21,6 @@ limitations under the License. 
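The device-context change in PATCH 30 above moves seeding out of the context constructors: the context now lazily builds a single cuRAND generator from a stored seed_ on first use. The pattern, condensed (framework plumbing and PADDLE_ENFORCE error checks omitted; assumes the standard cuRAND host API):

    #include <cstdint>
    #include <curand.h>

    // Creates the generator on first call and seeds it exactly once; later
    // calls reuse it. Return codes should be checked in real code.
    curandGenerator_t GetGenerator(curandGenerator_t* gen, uint64_t seed) {
      if (*gen == nullptr) {
        curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(*gen, seed);
      }
      return *gen;
    }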
*/ #include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif -#include #include #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -40,7 +39,6 @@ class DeviceContext { class CPUDeviceContext : public DeviceContext { public: - typedef std::mt19937 random_generator_type; CPUDeviceContext(); explicit CPUDeviceContext(CPUPlace); virtual ~CPUDeviceContext() {} @@ -49,16 +47,7 @@ class CPUDeviceContext : public DeviceContext { Place GetPlace() const override; - random_generator_type& RandGenerator() { - if (!rand_generator_) { - rand_generator_.reset(new random_generator_type(random_seed_)); - } - return *rand_generator_.get(); - } - private: - unsigned random_seed_; - std::unique_ptr rand_generator_; std::unique_ptr eigen_device_; }; @@ -97,7 +86,8 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_stream_; private: - unsigned random_seed_; + uint64_t seed_; + // clang-format off cudnnHandle_t cudnn_handle_ = nullptr; cublasHandle_t cublas_handle_ = nullptr; From b8461c79fc72d3fc3ed8d0815058e12385eaa4d1 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Tue, 8 Aug 2017 00:00:06 +0000 Subject: [PATCH 31/51] implement init parameters selection with etcd --- go/glide.lock | 10 +- go/master/service_test.go | 4 +- go/pserver/client/c/cclient.go | 20 ++- go/pserver/client/client.go | 10 +- go/pserver/client/client_test.go | 14 +- go/pserver/client/etcd_client.go | 153 ++++++++++++++++--- go/pserver/client/etcd_client_test.go | 97 ++++++++++++ paddle/trainer/NewRemoteParameterUpdater.cpp | 2 +- 8 files changed, 269 insertions(+), 41 deletions(-) create mode 100644 go/pserver/client/etcd_client_test.go diff --git a/go/glide.lock b/go/glide.lock index be1fb24d77..1ecdd21752 100644 --- a/go/glide.lock +++ b/go/glide.lock @@ -1,5 +1,5 @@ hash: 1b9b07408ca7fac27a374dc2ccd2433e4bff090484008a037df967284949a582 -updated: 2017-08-03T21:46:51.744995189Z +updated: 2017-08-07T23:37:48.867469328Z imports: - name: github.com/beorn7/perks version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9 @@ -10,7 +10,7 @@ imports: - name: github.com/cockroachdb/cmux version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92 - name: github.com/coreos/etcd - version: c31bec0f29facff13f7c3e3d948e55dd6689ed42 + version: d0d1a87aa96ae14914751d42264262cb69eda170 subpackages: - alarm - auth @@ -24,6 +24,7 @@ imports: - error - etcdserver - etcdserver/api + - etcdserver/api/etcdhttp - etcdserver/api/v2http - etcdserver/api/v2http/httptypes - etcdserver/api/v3client @@ -210,11 +211,6 @@ testImports: version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9 subpackages: - spew -- name: github.com/docker/docker - version: b6d164e6c46d8115b146e4c3ac93784e9ef8b49e - subpackages: - - pkg/ioutils - - pkg/longpath - name: github.com/pmezard/go-difflib version: d8ed2627bdf02c080bf22230dbb337003b7aba2d subpackages: diff --git a/go/master/service_test.go b/go/master/service_test.go index 5f91910ecc..87ce9c9043 100644 --- a/go/master/service_test.go +++ b/go/master/service_test.go @@ -1,6 +1,7 @@ package master_test import ( + "io/ioutil" "os" "testing" "time" @@ -8,13 +9,12 @@ import ( "github.com/PaddlePaddle/Paddle/go/master" "github.com/coreos/etcd/clientv3" "github.com/coreos/etcd/embed" - "github.com/docker/docker/pkg/ioutils" "github.com/stretchr/testify/assert" ) func TestNewServiceWithEtcd(t *testing.T) { // setup an embed etcd server - etcdDir, err := ioutils.TempDir("", "") + etcdDir, err := ioutil.TempDir("", "") if err != nil { t.Fatal(err) } diff --git a/go/pserver/client/c/cclient.go 
b/go/pserver/client/c/cclient.go index 14ad077455..a49cd01522 100644 --- a/go/pserver/client/c/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -90,8 +90,12 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte { type selector bool -func (s selector) Select() bool { - return bool(s) +func (s selector) Select() (bool, error) { + return bool(s), nil +} + +func (s selector) Done() error { + return nil } type lister []client.Server @@ -114,11 +118,10 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli } //export paddle_new_etcd_pserver_client -func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client { - // TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters) +func paddle_new_etcd_pserver_client(etcdEndpoints *C.char) C.paddle_pserver_client { addr := C.GoString(etcdEndpoints) etcdClient := client.NewEtcd(addr) - c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0)) + c := client.NewClient(etcdClient, etcdClient.Desired(), etcdClient) return add(c) } @@ -136,7 +139,12 @@ func paddle_pserver_client_release(client C.paddle_pserver_client) { //export paddle_begin_init_params func paddle_begin_init_params(client C.paddle_pserver_client) C.int { c := get(client) - if selected := c.BeginInitParams(); selected { + selected, err := c.BeginInitParams() + if err != nil { + panic(err) + } + + if selected { return 1 } return 0 diff --git a/go/pserver/client/client.go b/go/pserver/client/client.go index 15adda4735..20d91e7703 100644 --- a/go/pserver/client/client.go +++ b/go/pserver/client/client.go @@ -27,9 +27,13 @@ import ( // TODO(helin): add RPC call retry logic -// Selector selects if the client should initialize parameter servers. +// Selector selects if the client should initialize parameters and +// reports the initialization process done. type Selector interface { - Select() bool + // Select selects if the client should initialize parameter servers. + Select() (bool, error) + // Done indicates the initialization process is done. + Done() error } // Server is the identification of a parameter Server. @@ -115,7 +119,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) { // servers. Other trainers will be blocked until the initialization is // done, and they need to get the initialized parameters from // parameter servers using GetParams. 
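From the trainer's C API the selection below is invisible: every trainer calls paddle_begin_init_params, and only the winner of the etcd lock sees a nonzero return. A hedged usage sketch built from the exports shown in cclient.go above; paddle_init_param and paddle_finish_init_params are assumed companions that this diff does not show:

    paddle_pserver_client c = paddle_new_etcd_pserver_client("127.0.0.1:2379");
    if (paddle_begin_init_params(c)) {
      // Selected: push initial values for every parameter, then finish.
      // paddle_init_param(c, ...);          // assumed API
      // paddle_finish_init_params(c);       // assumed API; maps to Done()
    } else {
      // Another trainer initialized; fetch parameters when training starts.
    }
    paddle_pserver_client_release(c);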
-func (c *Client) BeginInitParams() bool { +func (c *Client) BeginInitParams() (bool, error) { return c.sel.Select() } diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go index 1243ebd683..c3d88e926d 100644 --- a/go/pserver/client/client_test.go +++ b/go/pserver/client/client_test.go @@ -124,8 +124,12 @@ func initEtcdClient() { type selector bool -func (s selector) Select() bool { - return bool(s) +func (s selector) Select() (bool, error) { + return bool(s), nil +} + +func (s selector) Done() error { + return nil } type lister []client.Server @@ -135,7 +139,11 @@ func (l lister) List() []client.Server { } func testClient(t *testing.T, c *client.Client) { - selected := c.BeginInitParams() + selected, err := c.BeginInitParams() + if err != nil { + t.Fatal(err) + } + if !selected { t.Fatal("should be selected.") } diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go index 977ae5af37..f9071caaa8 100644 --- a/go/pserver/client/etcd_client.go +++ b/go/pserver/client/etcd_client.go @@ -16,53 +16,60 @@ package client import ( "context" + "errors" + "fmt" "strconv" "strings" "time" "github.com/PaddlePaddle/Paddle/go/pserver" "github.com/coreos/etcd/clientv3" + "github.com/coreos/etcd/clientv3/concurrency" log "github.com/sirupsen/logrus" ) const ( defaultEtcdTimeout time.Duration = 5 * time.Second + + initLockPath = "/init_ps/lock" + initDonePath = "/init_ps/done" + initDoneVal = "1" ) -// EtcdClient is used by pserver client that is a part of trainer process. +// Etcd is used by pserver client that is a part of trainer process. // TODO: -// 1. add watcher to watch the change state of pservers) -// 1. add etcd lock) -type EtcdClient struct { +// 1. add watcher to watch the change state of pservers. +type Etcd struct { client *clientv3.Client timeout time.Duration endpoints []string + lock *concurrency.Mutex } // Desired read ps desired number from etcd. -func (p *EtcdClient) Desired() int { +func (e *Etcd) Desired() int { var psDesired int for { - ctx, cancel := context.WithTimeout(context.Background(), p.timeout) - resp, err := p.client.Get(ctx, pserver.PsDesired) + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) + resp, err := e.client.Get(ctx, pserver.PsDesired) cancel() if err != nil { log.Errorf("Get ps dresire number failed! recnnectiong..., %v", err) - time.Sleep(p.timeout) + time.Sleep(e.timeout) continue } kvs := resp.Kvs if len(kvs) == 0 { log.Infoln("Waiting for ps desired registered ...") - time.Sleep(p.timeout) + time.Sleep(e.timeout) continue } psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value)) if err != nil { log.Errorf("psDesired %d invalid %v", psDesired, err) - time.Sleep(p.timeout) + time.Sleep(e.timeout) continue } @@ -73,26 +80,26 @@ func (p *EtcdClient) Desired() int { } // List return the pserver list read from etcd. 
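For orientation, the etcd layout these methods rely on: pserver.PsDesired holds the expected pserver count, and pserver.PsPath + i (one key per index 0 .. desired-1) holds each server's address. Both Desired and List poll with the client timeout as the retry interval until every key is populated, so a trainer simply blocks until the whole cluster has registered.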
-func (p *EtcdClient) List() []Server { - psDesired := p.Desired() +func (e *Etcd) List() []Server { + psDesired := e.Desired() servers := make([]Server, psDesired) for { for i := 0; i < psDesired; i++ { - ctx, cancel := context.WithTimeout(context.Background(), p.timeout) + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) psKey := pserver.PsPath + strconv.Itoa(i) log.Debugf("checking %s", psKey) - resp, err := p.client.Get(ctx, psKey) + resp, err := e.client.Get(ctx, psKey) cancel() if err != nil { log.Infof("Get psKey= %s error, %v", psKey, err) - time.Sleep(p.timeout) + time.Sleep(e.timeout) continue } kvs := resp.Kvs if len(kvs) == 0 { log.Infof("Waiting for ps addr registered ...") - time.Sleep(p.timeout) + time.Sleep(e.timeout) continue } @@ -100,7 +107,7 @@ func (p *EtcdClient) List() []Server { // TODO(Longfei) check the ps address if psAddr == "" { log.Infof("Get psKey = %s, psAddr is empty", psKey) - time.Sleep(p.timeout) + time.Sleep(e.timeout) continue } log.Debugf("got value (%s) for key: %s", psAddr, psKey) @@ -113,7 +120,7 @@ func (p *EtcdClient) List() []Server { } // NewEtcd create a etcd client to return the state of pserver on etcd. -func NewEtcd(endpoints string) *EtcdClient { +func NewEtcd(endpoints string) *Etcd { ep := strings.Split(endpoints, ",") var cli *clientv3.Client var err error @@ -130,10 +137,118 @@ func NewEtcd(endpoints string) *EtcdClient { break } log.Infof("Connected to etcd: %s\n", endpoints) - client := &EtcdClient{ + client := &Etcd{ client: cli, timeout: defaultEtcdTimeout, endpoints: ep, } return client } + +// Select indicates if the current trainer is selected to initialize +// the pserver parameters. +func (e *Etcd) Select() (bool, error) { + sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(5)) + if err != nil { + return false, err + } + + lock := concurrency.NewMutex(sess, initLockPath) + log.Infof("Trying to acquire lock at %s.", initLockPath) + // Do not use timeout context here, since we don't know how + // long does it take for other trainers to initialize the + // parameters. + err = lock.Lock(context.Background()) + if err != nil { + return false, err + } + log.Infof("Successfully acquired lock at %s.", initLockPath) + + get := clientv3.OpGet(initDonePath) + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) + tresp, err := e.client.Txn(ctx).If(lock.IsOwner()).Then(get).Commit() + cancel() + if err != nil { + return false, err + } + + if !tresp.Succeeded { + return false, errors.New("no longer the owner of the lock") + } + + resp := tresp.Responses[0].GetResponseRange() + + if len(resp.Kvs) == 0 { + // Key value not set, select current trainer. + e.lock = lock + log.Infoln("Trainer selected.") + return true, nil + } + + if string(resp.Kvs[0].Value) == initDoneVal { + log.Infoln("Initialization is already done.") + ctx, cancel = context.WithTimeout(context.Background(), e.timeout) + err = lock.Unlock(ctx) + cancel() + if err != nil { + log.Errorln(err) + } + return false, nil + } + + return false, fmt.Errorf("key %s have unexpected value: %v", initDonePath, resp.Kvs[0].Value) +} + +// Done indicates the parameter initialization process is done. 
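Select() above folds three outcomes into one transaction that is valid only while the session still owns the init lock: done key absent means this trainer is selected and keeps the lock; key equal to initDoneVal means initialization already happened, so unlock and skip; anything else is an error. That decision in isolation, as a C++ sketch with "1" standing in for initDoneVal:

#include <string>

enum class SelectOutcome { kSelected, kAlreadyDone, kUnexpected };

// The branch structure guarded by lock.IsOwner() in Etcd::Select (sketch).
SelectOutcome Decide(bool still_lock_owner, bool done_key_exists,
                     const std::string& done_value) {
  if (!still_lock_owner) return SelectOutcome::kUnexpected;   // lost the lock
  if (!done_key_exists) return SelectOutcome::kSelected;      // keep lock, init
  if (done_value == "1") return SelectOutcome::kAlreadyDone;  // unlock, skip
  return SelectOutcome::kUnexpected;  // unexpected value under initDonePath
}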
+func (e *Etcd) Done() error { + if e.lock == nil { + return errors.New("lock is nil, Done called unexpectedly") + } + + put := clientv3.OpPut(initDonePath, initDoneVal) + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) + tresp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit() + cancel() + if err != nil { + return err + } + + if !tresp.Succeeded { + return errors.New("no longer the owner of the lock") + } + + ctx, cancel = context.WithTimeout(context.Background(), e.timeout) + err = e.lock.Unlock(ctx) + cancel() + if err != nil { + log.Errorln(err) + } else { + e.lock = nil + } + + return nil +} + +// Close closes the etcd client. +func (e *Etcd) Close() error { + var err error + if e.lock != nil { + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) + err = e.lock.Unlock(ctx) + cancel() + if err == nil { + e.lock = nil + } + } + + cErr := e.client.Close() + if cErr != nil { + if err != nil { + log.Errorln(cErr) + return err + } + return cErr + } + + return err +} diff --git a/go/pserver/client/etcd_client_test.go b/go/pserver/client/etcd_client_test.go new file mode 100644 index 0000000000..9daeb97d4c --- /dev/null +++ b/go/pserver/client/etcd_client_test.go @@ -0,0 +1,97 @@ +package client_test + +import ( + "io/ioutil" + "os" + "sync" + "testing" + + "github.com/PaddlePaddle/Paddle/go/pserver/client" + "github.com/coreos/etcd/embed" +) + +func TestSelector(t *testing.T) { + etcdDir, err := ioutil.TempDir("", "") + if err != nil { + t.Fatal(err) + } + cfg := embed.NewConfig() + cfg.Dir = etcdDir + e, err := embed.StartEtcd(cfg) + if err != nil { + t.Fatal(err) + } + + defer func() { + e.Close() + if err := os.RemoveAll(etcdDir); err != nil { + t.Fatal(err) + } + }() + + <-e.Server.ReadyNotify() + + var mu sync.Mutex + selectedCount := 0 + var wg sync.WaitGroup + selectAndDone := func(c *client.Etcd) { + defer wg.Done() + + selected, err := c.Select() + if err != nil { + panic(err) + } + + if selected { + mu.Lock() + selectedCount++ + mu.Unlock() + err = c.Done() + if err != nil { + t.Fatal(err) + } + } + } + + c0 := client.NewEtcd("127.0.0.1:2379") + c1 := client.NewEtcd("127.0.0.1:2379") + c2 := client.NewEtcd("127.0.0.1:2379") + c3 := client.NewEtcd("127.0.0.1:2379") + wg.Add(3) + go selectAndDone(c0) + go selectAndDone(c1) + go selectAndDone(c2) + wg.Wait() + + // simulate trainer crashed and restarted after the + // initialization process. + wg.Add(1) + go selectAndDone(c3) + wg.Wait() + + mu.Lock() + if selectedCount != 1 { + t.Fatal("selected count wrong:", selectedCount) + } + mu.Unlock() + + err = c0.Close() + if err != nil { + t.Fatal(err) + } + + err = c1.Close() + if err != nil { + t.Fatal(err) + } + + err = c2.Close() + if err != nil { + t.Fatal(err) + } + + err = c3.Close() + if err != nil { + t.Fatal(err) + } +} diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp index e1558e3fdf..31bbab3aef 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.cpp +++ b/paddle/trainer/NewRemoteParameterUpdater.cpp @@ -51,7 +51,7 @@ void NewRemoteParameterUpdater::init( // create parameter server client. 
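TestSelector above spins up three concurrent trainers plus one late "restarted" trainer and asserts that exactly one is ever selected. Across machines that guarantee comes from the etcd lock and the done key; within a single process the same invariant reduces to one compare-and-swap. A self-contained C++ sketch of the invariant itself:

#include <atomic>
#include <cassert>
#include <thread>
#include <vector>

int main() {
  std::atomic<bool> selected{false};
  std::atomic<int> selected_count{0};
  std::vector<std::thread> trainers;
  for (int i = 0; i < 4; ++i) {
    trainers.emplace_back([&] {
      bool expected = false;
      // Only the first CAS winner "initializes"; later arrivals see true.
      if (selected.compare_exchange_strong(expected, true)) {
        selected_count.fetch_add(1);
      }
    });
  }
  for (auto& t : trainers) t.join();
  assert(selected_count.load() == 1);
  return 0;
}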
if (useEtcd_) { parameterClient_ = paddle_new_etcd_pserver_client( - (char *)pserverSpec_.c_str(), FLAGS_trainer_id == 0); + (char *)pserverSpec_.c_str()); } else { parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(), FLAGS_trainer_id == 0); From 34add153def8470afaae78f8a722d526c559f5b1 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Tue, 8 Aug 2017 14:06:20 -0400 Subject: [PATCH 32/51] fix style check --- paddle/trainer/NewRemoteParameterUpdater.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp index 31bbab3aef..af1dceed02 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.cpp +++ b/paddle/trainer/NewRemoteParameterUpdater.cpp @@ -50,8 +50,8 @@ void NewRemoteParameterUpdater::init( // create parameter server client. if (useEtcd_) { - parameterClient_ = paddle_new_etcd_pserver_client( - (char *)pserverSpec_.c_str()); + parameterClient_ = + paddle_new_etcd_pserver_client((char *)pserverSpec_.c_str()); } else { parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(), FLAGS_trainer_id == 0); From f64539bef9acbf18cc28083499fb2c46daf646c7 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Tue, 8 Aug 2017 22:20:32 +0000 Subject: [PATCH 33/51] use random port for embed etcd to avoid port collision --- go/master/service_test.go | 20 ++++++++++++-------- go/pserver/client/etcd_client_test.go | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/go/master/service_test.go b/go/master/service_test.go index 87ce9c9043..2d00c22d6f 100644 --- a/go/master/service_test.go +++ b/go/master/service_test.go @@ -2,7 +2,9 @@ package master_test import ( "io/ioutil" + "net/url" "os" + "strings" "testing" "time" @@ -19,6 +21,10 @@ func TestNewServiceWithEtcd(t *testing.T) { t.Fatal(err) } cfg := embed.NewConfig() + lpurl, _ := url.Parse("http://localhost:0") + lcurl, _ := url.Parse("http://localhost:0") + cfg.LPUrls = []url.URL{*lpurl} + cfg.LCUrls = []url.URL{*lcurl} cfg.Dir = etcdDir e, err := embed.StartEtcd(cfg) if err != nil { @@ -30,15 +36,13 @@ func TestNewServiceWithEtcd(t *testing.T) { t.Fatal(err) } }() - select { - case <-e.Server.ReadyNotify(): - t.Log("Server is ready!") - case <-time.After(60 * time.Second): - e.Server.Stop() // trigger a shutdown - t.Fatal("Server took too long to start!") - } - ep := []string{"127.0.0.1:2379"} + <-e.Server.ReadyNotify() + + port := strings.Split(e.Clients[0].Addr().String(), ":")[1] + endpoint := "127.0.0.1:" + port + + ep := []string{endpoint} masterAddr := "127.0.0.1:3306" store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30) if err != nil { diff --git a/go/pserver/client/etcd_client_test.go b/go/pserver/client/etcd_client_test.go index 9daeb97d4c..08742433e7 100644 --- a/go/pserver/client/etcd_client_test.go +++ b/go/pserver/client/etcd_client_test.go @@ -2,7 +2,9 @@ package client_test import ( "io/ioutil" + "net/url" "os" + "strings" "sync" "testing" @@ -16,6 +18,10 @@ func TestSelector(t *testing.T) { t.Fatal(err) } cfg := embed.NewConfig() + lpurl, _ := url.Parse("http://localhost:0") + lcurl, _ := url.Parse("http://localhost:0") + cfg.LPUrls = []url.URL{*lpurl} + cfg.LCUrls = []url.URL{*lcurl} cfg.Dir = etcdDir e, err := embed.StartEtcd(cfg) if err != nil { @@ -31,6 +37,9 @@ func TestSelector(t *testing.T) { <-e.Server.ReadyNotify() + port := strings.Split(e.Clients[0].Addr().String(), ":")[1] + 
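The listen URLs http://localhost:0 in this patch hand port selection to the kernel, and the test then reads the real port back from e.Clients[0].Addr(). The same trick at the plain socket level, as a POSIX C++ sketch (assumes a Unix-like platform; error handling trimmed for brevity):

#include <arpa/inet.h>
#include <cstdio>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main() {
  int fd = socket(AF_INET, SOCK_STREAM, 0);
  sockaddr_in addr{};
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
  addr.sin_port = 0;  // port 0: the kernel assigns any free port
  bind(fd, reinterpret_cast<sockaddr*>(&addr), sizeof(addr));
  socklen_t len = sizeof(addr);
  getsockname(fd, reinterpret_cast<sockaddr*>(&addr), &len);
  std::printf("kernel-assigned port: %d\n", ntohs(addr.sin_port));
  close(fd);
  return 0;
}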
endpoint := "127.0.0.1:" + port + var mu sync.Mutex selectedCount := 0 var wg sync.WaitGroup @@ -53,10 +62,10 @@ func TestSelector(t *testing.T) { } } - c0 := client.NewEtcd("127.0.0.1:2379") - c1 := client.NewEtcd("127.0.0.1:2379") - c2 := client.NewEtcd("127.0.0.1:2379") - c3 := client.NewEtcd("127.0.0.1:2379") + c0 := client.NewEtcd(endpoint) + c1 := client.NewEtcd(endpoint) + c2 := client.NewEtcd(endpoint) + c3 := client.NewEtcd(endpoint) wg.Add(3) go selectAndDone(c0) go selectAndDone(c1) From 6159f5db14b580fab0386fdbe258b26c892be257 Mon Sep 17 00:00:00 2001 From: zchen0211 Date: Tue, 8 Aug 2017 16:38:51 -0700 Subject: [PATCH 34/51] code style fix --- paddle/operators/gather.h | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/paddle/operators/gather.h b/paddle/operators/gather.h index 5adc1e6b17..8b02156545 100644 --- a/paddle/operators/gather.h +++ b/paddle/operators/gather.h @@ -28,11 +28,8 @@ namespace operators { /* Implementation of CPU copy */ template -void CPUGather(const T* params, - const int* indices, - const int slice_size, - const int index_size, - T* output) { +void CPUGather(const T* params, const int* indices, const int slice_size, + const int index_size, T* output) { const size_t slice_bytes = slice_size * sizeof(T); for (size_t i = 0; i < index_size; ++i) { @@ -47,11 +44,8 @@ void CPUGather(const T* params, d = cuda_stream(gpu_id_, stream_id_); */ template -void GPUGather(const T* src, - const int* index, - const int slice_size, - const int index_size, - T* output); +void GPUGather(const T* src, const int* index, const int slice_size, + const int index_size, T* output); /** * Return a new tensor from source tensor, gathered according to index @@ -60,8 +54,7 @@ void GPUGather(const T* src, * return: output tensor */ template -void Gather(const platform::Place& place, - const paddle::framework::Tensor* src, +void Gather(const platform::Place& place, const paddle::framework::Tensor* src, const paddle::framework::Tensor* index, paddle::framework::Tensor* output) { // check index of shape 1-D @@ -78,10 +71,7 @@ void Gather(const platform::Place& place, // Gathering if (platform::is_cpu_place(place)) { - CPUGather(src->data(), - index->data(), - slice_size, - index_size, + CPUGather(src->data(), index->data(), slice_size, index_size, output->data()); } else { // init for GPU From 54cda76fe8058daefa9f2cb24efe19b035162bf5 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Tue, 8 Aug 2017 17:02:55 -0700 Subject: [PATCH 35/51] Refactorize enforece_test.cc --- Dockerfile | 2 +- paddle/platform/CMakeLists.txt | 2 +- paddle/platform/enforce_test.cc | 144 +++++++++++--------------------- 3 files changed, 51 insertions(+), 97 deletions(-) diff --git a/Dockerfile b/Dockerfile index 06a3d89307..8ac123bf9c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,7 +28,7 @@ RUN apt-get update && \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ python-matplotlib gcc-4.8 g++-4.8 \ - automake locales clang-format-3.8 swig doxygen cmake \ + automake locales clang-format swig doxygen cmake \ liblapack-dev liblapacke-dev libboost-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ net-tools && \ diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index bd77bb7daa..4154aad15c 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -8,7 +8,7 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(dynload) -cc_test(enforce_test SRCS 
enforce_test.cc) +cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece) IF(WITH_GPU) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc index 4dfb697546..5408fce558 100644 --- a/paddle/platform/enforce_test.cc +++ b/paddle/platform/enforce_test.cc @@ -13,6 +13,10 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/platform/enforce.h" +#include "paddle/string/piece.h" + +using StringPiece = paddle::string::Piece; +using paddle::string::HasPrefix; TEST(ENFORCE, OK) { PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345); @@ -22,19 +26,15 @@ TEST(ENFORCE, OK) { } TEST(ENFORCE, FAILED) { - bool in_catch = false; + bool caught_exception = false; try { PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123); } catch (paddle::platform::EnforceNotMet error) { - // your error handling code here - in_catch = true; - std::string msg = "Enforce is not ok 123 at all"; - const char* what = error.what(); - for (size_t i = 0; i < msg.length(); ++i) { - ASSERT_EQ(what[i], msg[i]); - } + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); } - ASSERT_TRUE(in_catch); + EXPECT_TRUE(caught_exception); } TEST(ENFORCE, NO_ARG_OK) { @@ -47,41 +47,27 @@ TEST(ENFORCE, NO_ARG_OK) { TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) { int a = 2; - bool in_catch = false; - + bool caught_exception = false; try { PADDLE_ENFORCE_EQ(a, 1 + 3); - } catch (paddle::platform::EnforceNotMet error) { - in_catch = true; - const std::string msg = "enforce a == 1 + 3 failed, 2 != 4"; - const char* what = error.what(); - for (size_t i = 0; i < msg.length(); ++i) { - ASSERT_EQ(what[i], msg[i]); - } + caught_exception = true; + HasPrefix(StringPiece(error.what()), "enforce a == 1 + 3 failed, 2 != 4"); } - - ASSERT_TRUE(in_catch); + EXPECT_TRUE(caught_exception); } TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { int a = 2; - bool in_catch = false; - + bool caught_exception = false; try { PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their"); - } catch (paddle::platform::EnforceNotMet error) { - in_catch = true; - const std::string msg = - "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"; - const char* what = error.what(); - for (size_t i = 0; i < msg.length(); ++i) { - ASSERT_EQ(what[i], msg[i]); - } + caught_exception = true; + HasPrefix(StringPiece(error.what()), + "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"); } - - ASSERT_TRUE(in_catch); + EXPECT_TRUE(caught_exception); } TEST(ENFORCE_NE, OK) { @@ -89,42 +75,32 @@ TEST(ENFORCE_NE, OK) { PADDLE_ENFORCE_NE(1.0, 2UL); } TEST(ENFORCE_NE, FAIL) { - bool in_catch = false; + bool caught_exception = false; try { // 2UL here to check data type compatible PADDLE_ENFORCE_NE(1.0, 1UL); - } catch (paddle::platform::EnforceNotMet error) { - in_catch = true; - const std::string msg = "enforce 1.0 != 1UL failed, 1.000000 == 1"; - const char* what = error.what(); - for (size_t i = 0; i < msg.length(); ++i) { - ASSERT_EQ(what[i], msg[i]); - } + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), + "enforce 1.0 != 1UL failed, 1.000000 == 1")) + << error.what() << " does not have expected prefix"; } - - ASSERT_TRUE(in_catch); + EXPECT_TRUE(caught_exception); } TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); } TEST(ENFORCE_GT, FAIL) { - bool in_catch = false; - + bool caught_exception = false; try { - // 2UL here to check data type compatible PADDLE_ENFORCE_GT(1, 2UL); } catch 
(paddle::platform::EnforceNotMet error) { - in_catch = true; - const std::string msg = "enforce 1 > 2UL failed, 1 <= 2"; - const char* what = error.what(); - for (size_t i = 0; i < msg.length(); ++i) { - ASSERT_EQ(what[i], msg[i]); - } + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); } - - ASSERT_TRUE(in_catch); + EXPECT_TRUE(caught_exception); } TEST(ENFORCE_GE, OK) { @@ -134,21 +110,16 @@ TEST(ENFORCE_GE, OK) { PADDLE_ENFORCE_GE(3.21, 2UL); } TEST(ENFORCE_GE, FAIL) { - bool in_catch = false; - + bool caught_exception = false; try { PADDLE_ENFORCE_GE(1, 2UL); } catch (paddle::platform::EnforceNotMet error) { - in_catch = true; - const std::string msg = "enforce 1 >= 2UL failed, 1 < 2"; - const char* what = error.what(); - for (size_t i = 0; i < msg.length(); ++i) { - ASSERT_EQ(what[i], msg[i]); - } + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 >= 2UL failed, 1 < 2")); } - - ASSERT_TRUE(in_catch); + EXPECT_TRUE(caught_exception); } TEST(ENFORCE_LE, OK) { @@ -159,21 +130,16 @@ TEST(ENFORCE_LE, OK) { PADDLE_ENFORCE_LE(2UL, 3.2); } TEST(ENFORCE_LE, FAIL) { - bool in_catch = false; - + bool caught_exception = false; try { PADDLE_ENFORCE_GT(1, 2UL); } catch (paddle::platform::EnforceNotMet error) { - in_catch = true; - const std::string msg = "enforce 1 > 2UL failed, 1 <= 2"; - const char* what = error.what(); - for (size_t i = 0; i < msg.length(); ++i) { - ASSERT_EQ(what[i], msg[i]); - } + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); } - - ASSERT_TRUE(in_catch); + EXPECT_TRUE(caught_exception); } TEST(ENFORCE_LT, OK) { @@ -182,21 +148,15 @@ TEST(ENFORCE_LT, OK) { PADDLE_ENFORCE_LT(2UL, 3); } TEST(ENFORCE_LT, FAIL) { - bool in_catch = false; - + bool caught_exception = false; try { PADDLE_ENFORCE_LT(1UL, 0.12); - } catch (paddle::platform::EnforceNotMet error) { - in_catch = true; - const std::string msg = "enforce 1UL < 0.12 failed, 1 >= 0.12"; - const char* what = error.what(); - for (size_t i = 0; i < msg.length(); ++i) { - ASSERT_EQ(what[i], msg[i]); - } + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), + "enforce 1UL < 0.12 failed, 1 >= 0.12")); } - - ASSERT_TRUE(in_catch); + EXPECT_TRUE(caught_exception); } TEST(ENFORCE_NOT_NULL, OK) { @@ -205,20 +165,14 @@ TEST(ENFORCE_NOT_NULL, OK) { delete a; } TEST(ENFORCE_NOT_NULL, FAIL) { - bool in_catch = false; - int* a{nullptr}; - + bool caught_exception = false; try { + int* a = nullptr; PADDLE_ENFORCE_NOT_NULL(a); } catch (paddle::platform::EnforceNotMet error) { - in_catch = true; - const std::string msg = "a should not be null"; - const char* what = error.what(); - for (size_t i = 0; i < msg.length(); ++i) { - ASSERT_EQ(what[i], msg[i]); - } + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null")); } - - ASSERT_TRUE(in_catch); + EXPECT_TRUE(caught_exception); } From ede02d7d795f6b3fb237c95276fb3c5749c7f216 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Wed, 9 Aug 2017 09:05:35 +0800 Subject: [PATCH 36/51] LODTensor (Level of details, or Level of sequences Tensor). 
(#3109) * add lodtensor * add reshape of lod * add details * rename Elements/Levels * size_t and vector reserve * add details * add const& std::shared_ptr * add lod_tensor_impl.h * remove a shared_ptr --- paddle/framework/CMakeLists.txt | 3 + paddle/framework/details/lod_tensor.cc | 62 ++++++++++ paddle/framework/details/lod_tensor.h | 46 +++++++ paddle/framework/lod_tensor.cc | 51 ++++++++ paddle/framework/lod_tensor.h | 145 ++++++++++++++++++++++ paddle/framework/lod_tensor_impl.h | 60 +++++++++ paddle/framework/lod_tensor_test.cc | 165 +++++++++++++++++++++++++ paddle/framework/tensor.h | 2 + paddle/framework/tensor_test.cc | 2 +- 9 files changed, 535 insertions(+), 1 deletion(-) create mode 100644 paddle/framework/details/lod_tensor.cc create mode 100644 paddle/framework/details/lod_tensor.h create mode 100644 paddle/framework/lod_tensor.cc create mode 100644 paddle/framework/lod_tensor.h create mode 100644 paddle/framework/lod_tensor_impl.h create mode 100644 paddle/framework/lod_tensor_test.cc diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 33e6baf818..6601918c90 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -7,6 +7,9 @@ cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) +cc_library(lod_tensor SRCS lod_tensor.cc details/lod_tensor.cc DEPS ddim place tensor) +cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor) + cc_test(variable_test SRCS variable_test.cc) cc_library(scope SRCS scope.cc) diff --git a/paddle/framework/details/lod_tensor.cc b/paddle/framework/details/lod_tensor.cc new file mode 100644 index 0000000000..9ad3979e5b --- /dev/null +++ b/paddle/framework/details/lod_tensor.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/lod_tensor.h" + +#include + +namespace paddle { +namespace framework { +namespace details { + +using LOD = LODTensor::LOD; + +std::shared_ptr SliceLOD(const LOD &lod, size_t level_begin, + size_t level_end) { + auto new_lod = std::make_shared(); + new_lod->reserve(level_end - level_begin); + for (size_t i = level_begin; i < level_end; i++) { + new_lod->emplace_back(lod[i]); + } + return new_lod; +} + +std::shared_ptr SliceLOD(const LOD &lod, size_t level, size_t elem_begin, + size_t elem_end, bool tensor_shared) { + // slice the lod. 
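When the sliced tensor is copied rather than shared, the code below shifts every retained offset by the slice's start so the new LOD begins at zero again. That offset arithmetic on a single level, as a standalone C++ sketch:

#include <algorithm>
#include <vector>

// Keep offsets [elem_begin, elem_end] of one LOD level; when the tensor is
// copied (not shared), rebase them so the sliced LOD starts at zero.
std::vector<int> SliceLevel(const std::vector<int>& level, size_t elem_begin,
                            size_t elem_end, bool rebase) {
  std::vector<int> out(level.begin() + elem_begin,
                       level.begin() + elem_end + 1);
  if (rebase) {
    const int start = out.front();
    std::transform(out.begin(), out.end(), out.begin(),
                   [start](int v) { return v - start; });
  }
  return out;
}
// e.g. level {0, 5, 10, 15, 20} with elem_begin=1, elem_end=3 keeps
// {5, 10, 15}; rebased it becomes {0, 5, 10}, matching the
// SliceCopied_Element expectations further down.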
+ auto new_lod = std::make_shared(); + new_lod->reserve(lod.size() - level); + auto start = lod.at(level)[elem_begin]; + auto end = lod.at(level)[elem_end]; + + for (auto it = lod.begin() + level; it != lod.end(); it++) { + auto it_begin = std::find(it->begin(), it->end(), start); + auto it_end = std::find(it_begin, it->end(), end); + PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info"); + PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info"); + new_lod->emplace_back(it_begin, it_end + 1); + if (!tensor_shared) { + // reset offset if tensor is copyed and sliced. + std::transform(new_lod->back().begin(), new_lod->back().end(), + new_lod->back().begin(), + [start](int v) { return v - start; }); + PADDLE_ENFORCE(new_lod->back().front() == 0, "error in slice LOD"); + } + } + return new_lod; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/details/lod_tensor.h b/paddle/framework/details/lod_tensor.h new file mode 100644 index 0000000000..9a6a6cd2ea --- /dev/null +++ b/paddle/framework/details/lod_tensor.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace framework { +namespace details { + +/* + * Slice levels from LOD. + * + * @lod: LOD to slice. + * @level_begin: level to begin slice. + * @level_end: level to end slice. + */ +std::shared_ptr SliceLOD(const LODTensor::LOD &lod, + size_t level_begin, size_t level_end); + +/* + * Slice elements from a level of LOD. + * + * @lod: LOD to slice. + * @level: which level to slice. + * @elem_begin: element's index to begin slice. + * @elem_end: element's index to end slice. + */ +std::shared_ptr SliceLOD(const LODTensor::LOD &lod, + size_t level, size_t elem_begin, + size_t elem_end, bool tensor_shared); +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc new file mode 100644 index 0000000000..70045dbf7a --- /dev/null +++ b/paddle/framework/lod_tensor.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/framework/lod_tensor.h" + +#include + +namespace paddle { +namespace framework { + +LODTensor LODTensor::SliceShared(size_t level_begin, size_t level_end) const { + PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced."); + auto new_lod = details::SliceLOD(*lod_start_pos_, level_begin, level_end); + // slice levels just need to update LOD info, each level will contains the + // whole tensor_, so no need to modify tensor_. + return LODTensor(tensor_, new_lod); +} + +LODTensor LODTensor::SliceShared(size_t level, size_t elem_begin, + size_t elem_end) const { + PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced."); + PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level, + NumLevels()); + PADDLE_ENFORCE(elem_begin < NumElements(level), + "element begin [%d] out of range [%d]", elem_begin, + NumElements(level)); + PADDLE_ENFORCE(elem_end < NumElements(level) + 1, + "element end [%d] out of range [%d]", elem_end, + NumElements(level)); + + auto new_lod = details::SliceLOD(*lod_start_pos_, level, elem_begin, elem_end, + true /*tensor_shared*/); + + // slice elements just need to update LOD info, because offsets are not + // changed, so the original tensor_ can be reused. + return LODTensor(tensor_, new_lod); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h new file mode 100644 index 0000000000..4933479b10 --- /dev/null +++ b/paddle/framework/lod_tensor.h @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#if (!PADDLE_ONLY_CPU) +#include +#include +#endif + +#include "paddle/framework/ddim.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace framework { + +/* + * LODTensor (Level of details Tensor) + * see https://en.wikipedia.org/wiki/Level_of_details for reference. + */ +class LODTensor { + public: +// Level save offsets of each unit. +#ifdef PADDLE_ONLY_CPU + using Level = std::vector; +#else + using Level = thrust::device_vector; +#endif + // LOD stores offsets of each level of units, the largest units level first, + // then the smaller units level. Each Level stores the offsets of units in + // Tesor. + typedef std::vector LOD; + + LODTensor() {} + LODTensor(const std::shared_ptr &tensor, + const std::shared_ptr &lod) { + Reset(tensor, lod); + } + + void Reset(const std::shared_ptr &tensor, + const std::shared_ptr &lod) { + tensor_ = tensor; + lod_start_pos_ = lod; + } + + /* + * Get a element from LOD. 
+ */ + size_t lod_element(size_t level, size_t elem) const { + PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level, + NumLevels()); + PADDLE_ENFORCE(elem < NumElements(level), + "element begin [%d] out of range [%d]", elem, + NumElements(level)); + return (*lod_start_pos_)[level][elem]; + } + + /* + * Number of LODTensor's levels, each level has units of data, for example, + * in the sentence's view, article, paragraph, sentence are 3 levels. + */ + size_t NumLevels() const { + return lod_start_pos_ ? lod_start_pos_->size() : 0UL; + } + /* + * Number of elements in a level. + */ + size_t NumElements(size_t level = 0) const { + PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level, + NumLevels()); + // the last offset is the end of last element + return lod_start_pos_->at(level).size() - 1; + } + + /* + * Slice of levels[level_begin:level_end], with tensor copied. + */ + template + LODTensor SliceCopied(size_t level_begin, size_t level_end, + const platform::Place &dst_place) const; + + /* + * Slice of levels[level_begin:level_end], with tensor shared. + */ + LODTensor SliceShared(size_t level_begin, size_t level_end) const; + + /* + * Slice of elements of a level, [elem_begin: elem_end], with tensor copied. + * @note: low performance in slice lod_start_pos_. + */ + template + LODTensor SliceCopied(size_t level, size_t elem_begin, size_t elem_end, + const platform::Place &dst_place) const; + + /* + * Slice of elements of a level, [elem_begin: elem_end], with tensor shared. + * @note: low performance in slice lod_start_pos_. + */ + LODTensor SliceShared(size_t level, size_t elem_begin, size_t elem_end) const; + + /* + * Copy other's lod_start_pos_, to share LOD info. + * @note: the LOD info should not be changed. + */ + void ShareLOD(const LODTensor &other) { + lod_start_pos_ = other.lod_start_pos_; + } + + /* + * Copy other's lod_start_pos_'s content, free to mutate. + */ + void CopyLOD(const LODTensor &other) { + lod_start_pos_ = std::make_shared(*other.lod_start_pos_); + } + /* + * Determine whether LODTensor has a valid LOD info. + */ + bool HasLOD() const { return bool(lod_start_pos_); } + LOD *lod() const { return lod_start_pos_.get(); } + + std::shared_ptr &tensor() { return tensor_; } + Tensor *raw_tensor() { return tensor_.get(); } + + private: + std::shared_ptr lod_start_pos_; + std::shared_ptr tensor_; +}; + +} // namespace framework +} // namespace paddle + +#include "paddle/framework/lod_tensor_impl.h" diff --git a/paddle/framework/lod_tensor_impl.h b/paddle/framework/lod_tensor_impl.h new file mode 100644 index 0000000000..0eb6469aea --- /dev/null +++ b/paddle/framework/lod_tensor_impl.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/details/lod_tensor.h" + +namespace paddle { +namespace framework { + +template +LODTensor LODTensor::SliceCopied(size_t level_begin, size_t level_end, + const platform::Place &dst_place) const { + PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced."); + auto new_lod = details::SliceLOD(*lod_start_pos_, level_begin, level_end); + auto new_tensor = std::make_shared(); + new_tensor->CopyFrom(*tensor_, dst_place); + + return LODTensor(new_tensor, new_lod); +} + +template +LODTensor LODTensor::SliceCopied(size_t level, size_t elem_begin, + size_t elem_end, + const platform::Place &dst_place) const { + PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced."); + PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level, + NumLevels()); + PADDLE_ENFORCE(elem_begin < NumElements(level), + "element begin [%d] out of range [%d]", elem_begin, + NumElements(level)); + PADDLE_ENFORCE(elem_end < NumElements(level) + 1, + "element end [%d] out of range [%d]", elem_end, + NumElements(level)); + + auto new_lod = details::SliceLOD(*lod_start_pos_, level, elem_begin, elem_end, + false /*tensor_shared*/); + + auto start_idx = new_lod->front().front(); + auto end_idx = new_lod->front().back() - 1 /*the next element's start*/; + auto sliced_tensor = tensor_->Slice(start_idx, end_idx); + auto new_tensor = std::make_shared(); + new_tensor->CopyFrom(sliced_tensor, dst_place); + + return LODTensor(new_tensor, new_lod); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc new file mode 100644 index 0000000000..511716375e --- /dev/null +++ b/paddle/framework/lod_tensor_test.cc @@ -0,0 +1,165 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "paddle/framework/lod_tensor.h" + +#include +#include +#include + +namespace paddle { +namespace framework { + +class LODTensorTester : public ::testing::Test { + public: + virtual void SetUp() override { + lod_tensor.reset(new LODTensor); + // tensor's batch_size: 30 + // 3 levels + // 0 10 20 + // 0 5 10 15 20 + // 0 2 5 7 10 12 15 20 + auto lod = std::make_shared(); + lod->push_back(std::vector{0, 10, 20}); + lod->push_back(std::vector{0, 5, 10, 15, 20}); + lod->push_back(std::vector{0, 2, 5, 7, 10, 12, 15, 17, 20}); + + auto tensor = std::make_shared(); + tensor->Resize({20 /*batch size*/, 128 /*dim*/}); + // malloc memory + tensor->mutable_data(place); + + lod_tensor->Reset(tensor, lod); + } + + protected: + std::unique_ptr lod_tensor; + platform::CPUPlace place; +}; + +TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor->NumLevels(), 3UL); } + +TEST_F(LODTensorTester, NumElements) { + ASSERT_EQ(lod_tensor->NumElements(0), 2UL); + ASSERT_EQ(lod_tensor->NumElements(1), 4UL); + ASSERT_EQ(lod_tensor->NumElements(2), 8UL); +} + +TEST_F(LODTensorTester, SliceShared_Level) { + // slice 1 level + for (size_t level = 0; level < 3UL; ++level) { + auto new_lod_tensor = lod_tensor->SliceShared(level, level + 1); + ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL); + ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level)); + ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor()); + } + // slice 2 level + for (size_t level = 0; level < 2UL; ++level) { + auto new_lod_tensor = lod_tensor->SliceShared(level, level + 2); + ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level)); + ASSERT_EQ(new_lod_tensor.NumElements(1), + lod_tensor->NumElements(level + 1)); + ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor()); + } +} + +TEST_F(LODTensorTester, SliceCopied_Level) { + // slice 1 level + for (size_t level = 0; level < 3UL; ++level) { + auto new_lod_tensor = + lod_tensor->SliceCopied(level, level + 1, place); + ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL); + ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level)); + // ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor()); + // TODO(superjom) add tensor comparation here. + } + // slice 2 level + for (size_t level = 0; level < 2UL; ++level) { + auto new_lod_tensor = + lod_tensor->SliceCopied(level, level + 2, place); + ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level)); + ASSERT_EQ(new_lod_tensor.NumElements(1), + lod_tensor->NumElements(level + 1)); + // ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor()); + // TODO(superjom) add tensor comparation here. 
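These tests all lean on one invariant of the LOD layout: a level holding N elements stores N + 1 offsets, so NumElements(level) is offsets.size() - 1. The SetUp() LOD checked standalone, as a sketch:

#include <cassert>
#include <vector>

int main() {
  std::vector<std::vector<int>> lod = {
      {0, 10, 20},                        // level 0: 2 elements
      {0, 5, 10, 15, 20},                 // level 1: 4 elements
      {0, 2, 5, 7, 10, 12, 15, 17, 20}};  // level 2: 8 elements
  for (const auto& level : lod) {
    assert(level.size() >= 2);  // at least one element per level
  }
  assert(lod[0].size() - 1 == 2);
  assert(lod[1].size() - 1 == 4);
  assert(lod[2].size() - 1 == 8);
  return 0;
}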
+ } +} + +TEST_F(LODTensorTester, SliceShared_Element) { + size_t level = 0; + auto new_lod_tensor = lod_tensor->SliceShared(level, 0, 2); + ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL); + ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL); + ASSERT_EQ(new_lod_tensor.NumElements(2), 8UL); + ASSERT_EQ(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor()); + + level = 1; + new_lod_tensor = lod_tensor->SliceShared(level, 0, 2); + ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL); + ASSERT_EQ(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor()); +} + +TEST_F(LODTensorTester, SliceCopied_Element) { + size_t level = 0; + auto new_lod_tensor = lod_tensor->SliceCopied(level, 0, 2, place); + ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL); + ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL); + ASSERT_EQ(new_lod_tensor.NumElements(2), 8UL); + ASSERT_NE(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor()); + + level = 1; + new_lod_tensor = lod_tensor->SliceCopied(level, 0, 2, place); + ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL); + ASSERT_NE(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor()); + + level = 1; + // LOD is + // 0 5 10 + // 0 2 5 7 10 + new_lod_tensor = lod_tensor->SliceCopied(level, 1, 3, place); + ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL); + + ASSERT_EQ(new_lod_tensor.lod_element(0, 0), 0UL); + ASSERT_EQ(new_lod_tensor.lod_element(0, 1), 5UL); + ASSERT_EQ(new_lod_tensor.lod_element(1, 0), 0UL); + ASSERT_EQ(new_lod_tensor.lod_element(1, 1), 2UL); + ASSERT_EQ(new_lod_tensor.lod_element(1, 2), 5UL); + ASSERT_EQ(new_lod_tensor.lod_element(1, 3), 7UL); + + // TODO(superjom) compare the content of these tensors +} + +TEST_F(LODTensorTester, ShareLOD) { + LODTensor new_lod_tensor; + new_lod_tensor.ShareLOD(*lod_tensor); + ASSERT_EQ(new_lod_tensor.lod(), lod_tensor->lod()); +} + +TEST_F(LODTensorTester, CopyLOD) { + LODTensor new_lod_tensor; + new_lod_tensor.CopyLOD(*lod_tensor); + ASSERT_NE(new_lod_tensor.lod(), lod_tensor->lod()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index c44df05e4b..b57958591f 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -18,6 +18,8 @@ limitations under the License. 
*/ #include #include #include +#include + #include "paddle/framework/ddim.h" #include "paddle/memory/memory.h" #include "paddle/platform/device_context.h" diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 20276181b9..7db38d5cae 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -19,7 +19,7 @@ TEST(Tensor, Dims) { using namespace paddle::framework; using namespace paddle::platform; Tensor tt; - tt.Resize(make_ddim({2, 3, 4})); + tt.Resize({2, 3, 4}); DDim dims = tt.dims(); ASSERT_EQ(arity(dims), 3); for (int i = 0; i < 3; ++i) { From b008360b05cea60634a6afe07b7f2309fc6ea28e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 9 Aug 2017 13:08:40 +0800 Subject: [PATCH 37/51] merge InferShapeContext and OperatorContext (#3347) * merge InferShapeContext and OperatorContext * OperatorBase& instead of OperatorBase* --- paddle/framework/operator.h | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index c324fa6702..ceef9f028b 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -120,10 +120,10 @@ class OperatorBase { std::shared_ptr> in_out_idxs_; }; -class OperatorContext { +class InferShapeContext { public: - OperatorContext(const OperatorBase* op, const Scope& scope) - : op_(*op), scope_(scope) {} + InferShapeContext(const OperatorBase& op, const Scope& scope) + : op_(op), scope_(scope) {} size_t InputSize() const { return op_.inputs_.size(); } @@ -234,12 +234,6 @@ class OperatorContext { const Scope& scope_; }; -class InferShapeContext : public OperatorContext { - public: - InferShapeContext(const OperatorBase* op, const Scope& scope) - : OperatorContext(op, scope) {} -}; - template struct EigenDeviceConverter; @@ -255,11 +249,11 @@ struct EigenDeviceConverter { }; #endif -class ExecutionContext : public OperatorContext { +class ExecutionContext : public InferShapeContext { public: - ExecutionContext(const OperatorBase* op, const Scope& scope, + ExecutionContext(const OperatorBase& op, const Scope& scope, const platform::DeviceContext* device_context) - : OperatorContext(op, scope), device_context_(device_context) {} + : InferShapeContext(op, scope), device_context_(device_context) {} template , OpKernelHash>; void InferShape(const Scope& scope) const override { - InferShape(InferShapeContext(this, scope)); + InferShape(InferShapeContext(*this, scope)); } void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const final { auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); - opKernel->Compute(ExecutionContext(this, scope, &dev_ctx)); + opKernel->Compute(ExecutionContext(*this, scope, &dev_ctx)); } static std::unordered_map& From a573dd4cc6f5a41ddbeec1be560d587f61029005 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 9 Aug 2017 13:21:12 +0800 Subject: [PATCH 38/51] Use ostream << operator to get to_string * Make `PADDLE_ENFORCE_EQ` supports custom class, like DDim --- paddle/platform/enforce.h | 7 ++--- paddle/platform/enforce_test.cc | 40 +++++++++++++++++++++++++++- paddle/string/CMakeLists.txt | 1 + paddle/string/to_string.h | 40 ++++++++++++++++++++++++++++ paddle/string/to_string_test.cc | 46 +++++++++++++++++++++++++++++++++ 5 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 paddle/string/to_string.h create mode 100644 paddle/string/to_string_test.cc diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index d2adb997de..337a059fb1 
100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -15,11 +15,12 @@ limitations under the License. */ #pragma once #include -#include #include #include #include #include +#include "paddle/string/printf.h" +#include "paddle/string/to_string.h" #ifndef PADDLE_ONLY_CPU @@ -194,8 +195,8 @@ inline void throw_on_error(T e) { #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ PADDLE_ENFORCE(__VAL0 __CMP __VAL1, \ "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \ - #__VAL0, #__VAL1, std::to_string(__VAL0), \ - std::to_string(__VAL1), \ + #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ + paddle::string::to_string(__VAL1), \ paddle::string::Sprintf("" __VA_ARGS__)); } // namespace platform diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc index 5408fce558..80bdee3d9d 100644 --- a/paddle/platform/enforce_test.cc +++ b/paddle/platform/enforce_test.cc @@ -9,6 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include #include "gtest/gtest.h" @@ -83,7 +85,7 @@ TEST(ENFORCE_NE, FAIL) { } catch (paddle::platform::EnforceNotMet error) { caught_exception = true; EXPECT_TRUE(HasPrefix(StringPiece(error.what()), - "enforce 1.0 != 1UL failed, 1.000000 == 1")) + "enforce 1.0 != 1UL failed, 1 == 1")) << error.what() << " does not have expected prefix"; } EXPECT_TRUE(caught_exception); @@ -176,3 +178,39 @@ TEST(ENFORCE_NOT_NULL, FAIL) { } EXPECT_TRUE(caught_exception); } + +struct Dims { + size_t dims_[4]; + + bool operator==(const Dims& o) const { + for (size_t i = 0; i < 4; ++i) { + if (dims_[i] != o.dims_[i]) return false; + } + return true; + } +}; + +std::ostream& operator<<(std::ostream& os, const Dims& d) { + for (size_t i = 0; i < 4; ++i) { + if (i == 0) { + os << "["; + } + os << d.dims_[i]; + if (i == 4 - 1) { + os << "]"; + } else { + os << ", "; + } + } + return os; +} + +TEST(ENFORCE_USER_DEFINED_CLASS, EQ) { + Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}}; + PADDLE_ENFORCE_EQ(a, b); +} + +TEST(ENFORCE_USER_DEFINED_CLASS, NE) { + Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}}; + ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet); +} \ No newline at end of file diff --git a/paddle/string/CMakeLists.txt b/paddle/string/CMakeLists.txt index 5becf62672..60667b7287 100644 --- a/paddle/string/CMakeLists.txt +++ b/paddle/string/CMakeLists.txt @@ -2,3 +2,4 @@ cc_library(stringpiece SRCS piece.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) +cc_test(to_string_test SRCS to_string_test.cc) diff --git a/paddle/string/to_string.h b/paddle/string/to_string.h new file mode 100644 index 0000000000..4f478b6a36 --- /dev/null +++ b/paddle/string/to_string.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include +#include + +namespace paddle { +namespace string { +template +inline std::string to_string(T v) { + std::ostringstream sout; + sout << v; + return sout.str(); +} + +// Faster std::string/const char* type +template <> +inline std::string to_string(std::string v) { + return v; +} + +template <> +inline std::string to_string(const char* v) { + return std::string(v); +} + +} // namespace string +} // namespace paddle diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc new file mode 100644 index 0000000000..0ef06eac24 --- /dev/null +++ b/paddle/string/to_string_test.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/string/to_string.h" +#include + +constexpr char OUT_STR[] = "User Defined Output"; +class UserDefinedClass { +public: +}; + +std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) { + s << OUT_STR; + return s; +} + +TEST(to_string, normal) { + using namespace paddle::string; + ASSERT_EQ(std::to_string(10), to_string(10)); + ASSERT_EQ("abc", to_string("abc")); + + auto std_to_string = std::to_string(1.2); + auto my_to_string = to_string(1.2); + + // std::to_string might fill zero after float value, like 1.2000 + for (size_t i = 0; i < my_to_string.size(); ++i) { + ASSERT_EQ(my_to_string[i], std_to_string[i]); + } +} + +TEST(to_string, user_defined) { + using namespace paddle::string; + UserDefinedClass instance; + ASSERT_EQ(OUT_STR, to_string(instance)); +} \ No newline at end of file From e67a1c928d6ee3c0588d6b31c510c3e41ef83b38 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 9 Aug 2017 13:59:07 +0800 Subject: [PATCH 39/51] Make android compile pass --- paddle/string/to_string_test.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc index 0ef06eac24..57b4010626 100644 --- a/paddle/string/to_string_test.cc +++ b/paddle/string/to_string_test.cc @@ -25,6 +25,11 @@ std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) { return s; } +// android macro comes from +// https://stackoverflow.com/questions/15328751/android-macro-suddenly-not-defined +#if !defined(ANDROID) && !defined(__ANDROID__) +// In android, std::to_string is not defined. 
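The ostringstream-based to_string above exists because std::to_string only covers arithmetic types (and is missing from older Android NDKs, as the comment below notes), while routing through operator<< extends to any user type with a stream inserter. The idiom in isolation, with a hypothetical Dims3 standing in for user-defined classes:

#include <ostream>
#include <sstream>
#include <string>

template <typename T>
std::string ToString(const T& v) {
  std::ostringstream out;
  out << v;  // picks up user-provided operator<< overloads too
  return out.str();
}

struct Dims3 {
  int d[3];
};

std::ostream& operator<<(std::ostream& os, const Dims3& s) {
  return os << "[" << s.d[0] << ", " << s.d[1] << ", " << s.d[2] << "]";
}
// ToString(Dims3{{1, 2, 3}}) yields "[1, 2, 3]";
// ToString(1.2) yields "1.2" (iostream formatting, not "1.200000").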
+// https://stackoverflow.com/questions/22774009/android-ndk-stdto-string-support TEST(to_string, normal) { using namespace paddle::string; ASSERT_EQ(std::to_string(10), to_string(10)); @@ -38,6 +43,7 @@ TEST(to_string, normal) { ASSERT_EQ(my_to_string[i], std_to_string[i]); } } +#endif TEST(to_string, user_defined) { using namespace paddle::string; From d8a3291d87f20b6e4973bd9735e7a761752a10f1 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 9 Aug 2017 14:14:47 +0800 Subject: [PATCH 40/51] Refine unit-test for to_string --- paddle/string/to_string_test.cc | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc index 57b4010626..4a075751ac 100644 --- a/paddle/string/to_string_test.cc +++ b/paddle/string/to_string_test.cc @@ -25,25 +25,12 @@ std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) { return s; } -// android macro comes from -// https://stackoverflow.com/questions/15328751/android-macro-suddenly-not-defined -#if !defined(ANDROID) && !defined(__ANDROID__) -// In android, std::to_string is not defined. -// https://stackoverflow.com/questions/22774009/android-ndk-stdto-string-support TEST(to_string, normal) { using namespace paddle::string; - ASSERT_EQ(std::to_string(10), to_string(10)); + ASSERT_EQ("10", to_string(10)); ASSERT_EQ("abc", to_string("abc")); - - auto std_to_string = std::to_string(1.2); - auto my_to_string = to_string(1.2); - - // std::to_string might fill zero after float value, like 1.2000 - for (size_t i = 0; i < my_to_string.size(); ++i) { - ASSERT_EQ(my_to_string[i], std_to_string[i]); - } + ASSERT_EQ("1.2", to_string(1.2)); } -#endif TEST(to_string, user_defined) { using namespace paddle::string; From 2d35c7008117cc2ec7c1a079947fa4537d6d2f58 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 9 Aug 2017 14:29:24 +0800 Subject: [PATCH 41/51] Fit google name style --- paddle/string/to_string_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc index 4a075751ac..5ff1b007f1 100644 --- a/paddle/string/to_string_test.cc +++ b/paddle/string/to_string_test.cc @@ -15,13 +15,13 @@ #include "paddle/string/to_string.h" #include -constexpr char OUT_STR[] = "User Defined Output"; +constexpr char kOutputString[] = "User Defined Output"; class UserDefinedClass { public: }; std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) { - s << OUT_STR; + s << kOutputString; return s; } @@ -35,5 +35,5 @@ TEST(to_string, normal) { TEST(to_string, user_defined) { using namespace paddle::string; UserDefinedClass instance; - ASSERT_EQ(OUT_STR, to_string(instance)); + ASSERT_EQ(kOutputString, to_string(instance)); } \ No newline at end of file From df4fe671fe59863b5cbb3b595da544016b678199 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 9 Aug 2017 14:47:56 +0800 Subject: [PATCH 42/51] "remove attribute" --- paddle/framework/operator.cc | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 511323b6f5..d9a013b883 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/framework/operator.h" #include -#include + +#include "paddle/framework/operator.h" namespace paddle { namespace framework { @@ -103,16 +103,6 @@ std::string OperatorBase::DebugString() const { ss << ", "; } } - ss << "), "; - ss << "Attrs:("; - size_t i = 0; - for (auto& attr : attrs_) { - ss << attr.first; - if (i != attrs_.size() - 1) { - ss << ", "; - } - i++; - } ss << ")."; return ss.str(); } From 6bac3e17b5b1f9e6a0ebb34ff43e959a971ef111 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 9 Aug 2017 15:01:37 +0800 Subject: [PATCH 43/51] "remove unused test net modified" --- paddle/operators/gaussian_random_op.cc | 7 ++++--- paddle/operators/gaussian_random_op.cu | 5 ++--- .../v2/framework/tests/test_gaussian_random_op.py | 4 +++- python/paddle/v2/framework/tests/test_net.py | 12 ++++++------ 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index b0b68ff36d..ef417ae2f0 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -22,8 +22,8 @@ template class GaussianRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - T mean = static_cast(context.op_.GetAttr("mean")); - T std = static_cast(context.op_.GetAttr("std")); + float mean = context.op_.GetAttr("mean"); + float std = context.op_.GetAttr("std"); auto* tensor = context.Output(0); T* data = tensor->mutable_data(context.GetPlace()); @@ -35,7 +35,8 @@ class GaussianRandomKernel : public framework::OpKernel { } std::mt19937 g(seed); std::normal_distribution distribution(mean, std); - for (int i = 0; i < framework::product(tensor->dims()); ++i) { + ssize_t size = framework::product(tensor->dims()); + for (int i = 0; i < size; ++i) { data[i] = distribution(g); } } diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu index 164753f946..54e4ae5d2b 100644 --- a/paddle/operators/gaussian_random_op.cu +++ b/paddle/operators/gaussian_random_op.cu @@ -26,8 +26,8 @@ template class GaussianRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - T mean = static_cast(context.op_.GetAttr("mean")); - T std = static_cast(context.op_.GetAttr("std")); + float mean = context.op_.GetAttr("mean"); + float std = context.op_.GetAttr("std"); auto* tensor = context.Output(0); T* data = tensor->mutable_data(context.GetPlace()); @@ -40,7 +40,6 @@ class GaussianRandomKernel : public framework::OpKernel { &g, CURAND_RNG_PSEUDO_DEFAULT)); PADDLE_ENFORCE( platform::dynload::curandSetPseudoRandomGeneratorSeed(g, seed)); - // auto g = const_cast(ctx)->RandGenerator(); curandGenerateNormal(g, data, framework::product(tensor->dims()), mean, std); } diff --git a/python/paddle/v2/framework/tests/test_gaussian_random_op.py b/python/paddle/v2/framework/tests/test_gaussian_random_op.py index 0ff8c89a14..20c68007b5 100644 --- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py +++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py @@ -14,13 +14,15 @@ class GaussianRandomTest(unittest.TestCase): def test_gaussian_random(self, place): scope = core.Scope() scope.new_var("Out").get_tensor() + op = Operator( "gaussian_random", Out="Out", dims=[1000, 784], mean=.0, std=1., - seed=0) + seed=10) + op.infer_shape(scope) context = core.DeviceContext.create(place) op.run(scope, context) diff --git 
a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py index 7df9b997b1..b30896553d 100644 --- a/python/paddle/v2/framework/tests/test_net.py +++ b/python/paddle/v2/framework/tests/test_net.py @@ -16,13 +16,13 @@ class TestNet(unittest.TestCase): net.complete_add_op(True) expected = ''' - Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, Out, fc.out). - Op(add_two), inputs:(X, Y), outputs:(Out). - Op(plain_net), inputs:(@EMPTY@, X, w), outputs:(@TEMP@fc@0, fc.out). +Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, Out, fc.out). + Op(add_two), inputs:(X, Y), outputs:(Out). + Op(plain_net), inputs:(@EMPTY@, X, w), outputs:(@TEMP@fc@0, fc.out). Op(fc), inputs:(X, w, @EMPTY@), outputs:(fc.out, @TEMP@fc@0). - Op(mul), inputs:(X, w), outputs:(@TEMP@fc@0). - Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc.out). - ''' + Op(mul), inputs:(X, w), outputs:(@TEMP@fc@0). + Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc.out). +''' self.assertEqual(expected, "\n" + str(net)) From b228b463fa6f1a4cf1f102dcea1eff61f16cc698 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 9 Aug 2017 15:09:57 +0800 Subject: [PATCH 44/51] Make const variables in operator.h fit google style * No POD instance is forbidden in global scope. See https://google.github.io/styleguide/cppguide.html#Static_and_Global_Variables --- paddle/framework/backward.cc | 6 ++-- paddle/framework/backward_test.cc | 31 +++++++++--------- paddle/framework/grad_op_builder_test.cc | 41 +++++++++++------------- paddle/framework/operator.h | 8 ++--- paddle/operators/mean_op.cc | 2 +- paddle/operators/mean_op.h | 4 +-- 6 files changed, 44 insertions(+), 48 deletions(-) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 47983110fa..be6656792f 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -133,8 +133,8 @@ std::shared_ptr BackwardRecursive( std::shared_ptr grad_op = OpRegistry::CreateGradOp(forwardOp); for (std::string& grad_input : grad_op->inputs_) { if (no_grad_names.count(grad_input)) { - std::string prefix = - grad_input.substr(0, grad_input.size() - kGradVarSuffix.size()); + std::string prefix = grad_input.substr( + 0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char)); grad_input = prefix + kZeroVarSuffix; // If part of input gradient of that operator is not calculated, fill @@ -167,7 +167,7 @@ std::shared_ptr Backward( std::unordered_set no_grad_names; no_grad_names.reserve(no_grad_vars.size()); - no_grad_names.insert(kEmptyVarName + kGradVarSuffix); + no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix); for (auto& name : no_grad_vars) { no_grad_names.insert(name + kGradVarSuffix); diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index 6d5835bd22..1677a3ed4c 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -171,10 +171,10 @@ TEST(Backward, simple_op_grad) { ASSERT_EQ(4UL, gop->inputs_.size()); ASSERT_EQ(f::kEmptyVarName, gop->inputs_[0]); ASSERT_EQ("rowwise_add_grad", gop->type_); - ASSERT_EQ("X" + f::kGradVarSuffix, gop->outputs_[0]); - ASSERT_EQ("b" + f::kGradVarSuffix, gop->outputs_[1]); + ASSERT_EQ(f::GradVarName("X"), gop->outputs_[0]); + ASSERT_EQ(f::GradVarName("b"), gop->outputs_[1]); - ASSERT_EQ("X" + f::kGradVarSuffix, gop->Output("X" + f::kGradVarSuffix)); + ASSERT_EQ(f::GradVarName("X"), gop->Output(f::GradVarName("X"))); } TEST(Backward, simple_op_not_need_grad) { @@ -182,7 +182,7 @@ TEST(Backward, 
simple_op_not_need_grad) { ASSERT_NE(fwd, nullptr); auto gop = f::Backward(*fwd, {"X"}); ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(), - "X" + f::kGradVarSuffix), + f::GradVarName("X")), gop->outputs_.end()); auto no_input_gop = f::Backward(*fwd, {"X", "b"}); @@ -250,18 +250,18 @@ TEST(Backward, net_input_of_network_not_need_grad) { all_output.erase(f::kEmptyVarName); for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) { - ASSERT_NE(all_output.find(out + f::kGradVarSuffix), all_output.end()); + ASSERT_NE(all_output.find(f::GradVarName(out)), all_output.end()); } // Not Generated X - ASSERT_EQ(all_output.find("X" + f::kGradVarSuffix), all_output.end()); + ASSERT_EQ(all_output.find(f::GradVarName("X")), all_output.end()); ASSERT_EQ(2UL, bwd_net->ops_.size()); ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp()); auto first_fc_grad = static_cast(bwd_net->ops_[1].get()); ASSERT_EQ(3UL, first_fc_grad->ops_.size()); ASSERT_EQ(f::kEmptyVarName, - first_fc_grad->ops_[2]->Output("A" + f::kGradVarSuffix)); + first_fc_grad->ops_[2]->Output(f::GradVarName("A"))); } TEST(Backward, net_shared_weight) { @@ -313,15 +313,15 @@ TEST(Backward, op_part_of_output_are_not_need) { ASSERT_EQ(1UL, fill_zero.inputs_.size()); ASSERT_EQ("Z", fill_zero.inputs_[0]); ASSERT_EQ(1UL, fill_zero.outputs_.size()); - ASSERT_EQ("Z" + f::kZeroVarSuffix, fill_zero.outputs_[0]); + ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.outputs_[0]); auto &d_many_out = *net->ops_[1]; ASSERT_EQ("many_output_op_grad", d_many_out.type_); ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size()); // I/O/OG - ASSERT_EQ("Z" + f::kZeroVarSuffix, d_many_out.Input("z" + f::kGradVarSuffix)); - ASSERT_EQ("Y" + f::kGradVarSuffix, d_many_out.Input("y" + f::kGradVarSuffix)); - ASSERT_EQ("X" + f::kGradVarSuffix, - d_many_out.Output("x" + f::kGradVarSuffix)); + ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, + d_many_out.Input(f::GradVarName("z"))); + ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y"))); + ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x"))); } TEST(Backward, op_part_of_input_are_not_need) { @@ -331,10 +331,9 @@ TEST(Backward, op_part_of_input_are_not_need) { ASSERT_EQ(grad_mul.type_, "mul_grad"); ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL); ASSERT_EQ(grad_mul.outputs_.size(), 2UL); - ASSERT_EQ(grad_mul.Output("A" + f::kGradVarSuffix), f::kEmptyVarName); - ASSERT_EQ(grad_mul.Output("B" + f::kGradVarSuffix), "b" + f::kGradVarSuffix); - ASSERT_EQ(grad_mul.Input("Out" + f::kGradVarSuffix), - "out" + f::kGradVarSuffix); + ASSERT_EQ(grad_mul.Output(f::GradVarName("A")), f::kEmptyVarName); + ASSERT_EQ(grad_mul.Output(f::GradVarName("B")), f::GradVarName("b")); + ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out")); ASSERT_EQ(grad_mul.Input("A"), "a"); ASSERT_EQ(grad_mul.Input("B"), "b"); ASSERT_EQ(grad_mul.Input("Out"), "out"); diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc index cf7143eba4..f1ebbae52f 100644 --- a/paddle/framework/grad_op_builder_test.cc +++ b/paddle/framework/grad_op_builder_test.cc @@ -83,21 +83,19 @@ TEST(GradOpBuilder, MutiInOut) { EXPECT_EQ(grad_test_op->Input("Out1"), "out1"); EXPECT_EQ(grad_test_op->Inputs("Out2_mult"), std::vector({"out2_1", "out2_2"})); - EXPECT_EQ(grad_test_op->Input("Out1" + f::kGradVarSuffix), - "out1" + f::kGradVarSuffix); - EXPECT_EQ(grad_test_op->Inputs("Out2_mult" + f::kGradVarSuffix), + EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out1")), + f::GradVarName("out1")); + 
EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out2_mult")), std::vector( - {"out2_1" + f::kGradVarSuffix, "out2_2" + f::kGradVarSuffix})); + {f::GradVarName("out2_1"), f::GradVarName("out2_2")})); ASSERT_EQ(grad_test_op->outputs_.size(), 5UL); - EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix), - "in1" + f::kGradVarSuffix); - EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix), - std::vector({"in2_1" + f::kGradVarSuffix, - "in2_2" + f::kGradVarSuffix, - "in2_3" + f::kGradVarSuffix})); - EXPECT_EQ(grad_test_op->Output("In3" + f::kGradVarSuffix), - "in3" + f::kGradVarSuffix); + EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1")); + EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")), + std::vector({f::GradVarName("in2_1"), + f::GradVarName("in2_2"), + f::GradVarName("in2_3")})); + EXPECT_EQ(grad_test_op->Output(f::GradVarName("In3")), f::GradVarName("in3")); } TEST(GradOpBuilder, IOIgnoredInGradient) { @@ -119,19 +117,18 @@ TEST(GradOpBuilder, IOIgnoredInGradient) { EXPECT_EQ(grad_test_op->Inputs("Out1_mult"), std::vector({"out1_1", "out1_2"})); EXPECT_EQ(grad_test_op->Input("Out2"), f::kEmptyVarName); - EXPECT_EQ(grad_test_op->Inputs("Out1_mult" + f::kGradVarSuffix), + EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out1_mult")), std::vector( - {"out1_1" + f::kGradVarSuffix, "out1_2" + f::kGradVarSuffix})); - EXPECT_EQ(grad_test_op->Input("Out2" + f::kGradVarSuffix), - "out2" + f::kGradVarSuffix); + {f::GradVarName("out1_1"), f::GradVarName("out1_2")})); + EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out2")), + f::GradVarName("out2")); ASSERT_EQ(grad_test_op->outputs_.size(), 5UL); - EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix), - "in1" + f::kGradVarSuffix); - EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix), + EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1")); + EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")), std::vector( - {"in2_1" + f::kGradVarSuffix, "in2_2" + f::kGradVarSuffix})); - EXPECT_EQ(grad_test_op->Outputs("In3_mult" + f::kGradVarSuffix), + {f::GradVarName("in2_1"), f::GradVarName("in2_2")})); + EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In3_mult")), std::vector( - {"in3_1" + f::kGradVarSuffix, "in3_2" + f::kGradVarSuffix})); + {f::GradVarName("in3_1"), f::GradVarName("in3_2")})); } diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index ceef9f028b..8949baf60e 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -33,19 +33,19 @@ namespace paddle { namespace framework { /// If a variable is a empty variable, that name will be used. -const std::string kEmptyVarName = "@EMPTY@"; +constexpr char kEmptyVarName[] = "@EMPTY@"; /// If a variable is a temporary variable, that name will be set in Python, /// but it will be convert to a unique name in scope after OpCreator. -const std::string kTempVarName = "@TEMP@"; +constexpr char kTempVarName[] = "@TEMP@"; /// If a variable's name has a certain suffix, it means that the /// variable is the gradient of another varibale. /// e.g. Variable "x@GRAD" is the gradient of varibale "x". -const std::string kGradVarSuffix = "@GRAD"; +constexpr char kGradVarSuffix[] = "@GRAD"; /// Variables with this suffix are supposed to be filled up with zeros. 
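// [Editor's note] This hunk replaces namespace-scope std::string constants
// with constexpr char arrays. The lines below are a minimal illustrative
// sketch, not part of the patch, and kSampleSuffix is a hypothetical name:
// a global std::string runs a constructor before main() and is exposed to
// static-initialization-order problems, whereas a constexpr char array is
// a POD baked into the binary. One side effect matters for the backward.cc
// hunks in this same patch: sizeof of a char array counts the trailing '\0'.
constexpr char kSampleSuffix[] = "@GRAD";  // fine: POD, no runtime init
static_assert(sizeof(kSampleSuffix) == 6,
              "five characters plus the terminating null");
// const std::string kBadSuffix = "@GRAD";  // disallowed: non-POD global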
-const std::string kZeroVarSuffix = "@ZERO"; +constexpr char kZeroVarSuffix[] = "@ZERO"; inline std::string GradVarName(const std::string& var_name) { return var_name + kGradVarSuffix; diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index 997b0c514e..2ea049cb36 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -41,7 +41,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker { class MeanGradOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - ctx.Output("X" + framework::kGradVarSuffix) + ctx.Output(framework::GradVarName("X")) ->Resize(ctx.Input("X")->dims()); } }; diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h index f3db0a29bb..e8595a14fa 100644 --- a/paddle/operators/mean_op.h +++ b/paddle/operators/mean_op.h @@ -48,10 +48,10 @@ template class MeanGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto OG = context.Input("Out" + framework::kGradVarSuffix); + auto OG = context.Input(framework::GradVarName("Out")); PADDLE_ENFORCE(framework::product(OG->dims()) == 1, "Mean Gradient should be scalar"); - auto IG = context.Output("X" + framework::kGradVarSuffix); + auto IG = context.Output(framework::GradVarName("X")); IG->mutable_data(context.GetPlace()); T ig_size = (T)framework::product(IG->dims()); From 6c7c4333f83b43de3c4cd6813cf6433bb563b56f Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 9 Aug 2017 16:05:22 +0800 Subject: [PATCH 45/51] Fix TravisCI test --- paddle/framework/backward.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index be6656792f..437a44a8aa 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -133,8 +133,9 @@ std::shared_ptr BackwardRecursive( std::shared_ptr grad_op = OpRegistry::CreateGradOp(forwardOp); for (std::string& grad_input : grad_op->inputs_) { if (no_grad_names.count(grad_input)) { + // +1 for \0 std::string prefix = grad_input.substr( 0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); grad_input = prefix + kZeroVarSuffix; // If part of input gradient of that operator is not calculated, fill From bbd7378b4386623b1946a7e5ae82be4cfb2f01e2 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 9 Aug 2017 17:08:56 +0800 Subject: [PATCH 46/51] "ci job failed weirdly. restart ci job."
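[Editor's note: a quick check of the "+1 for \0" fix in PATCH 45 above. For a char array, sizeof counts the trailing '\0', so sizeof(kGradVarSuffix) is 6 while the "@GRAD" suffix is only 5 characters long; PATCH 44 therefore stripped one character too many, and the "+ 1" restores it. A self-contained sketch, assuming only the constants shown in these patches:]

#include <cassert>
#include <string>

constexpr char kGradVarSuffix[] = "@GRAD";

int main() {
  std::string grad_input = "x@GRAD";  // the gradient of variable "x"
  // size (6) - sizeof(kGradVarSuffix) (6, with '\0') + 1 = 1: exactly "x".
  std::string prefix = grad_input.substr(
      0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
  assert(prefix == "x");  // without the + 1 the prefix would be empty
  return 0;
}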
--- .../v2/framework/tests/test_gaussian_random_op.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_gaussian_random_op.py b/python/paddle/v2/framework/tests/test_gaussian_random_op.py index 20c68007b5..f95ed70b58 100644 --- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py +++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py @@ -6,12 +6,13 @@ import numpy class GaussianRandomTest(unittest.TestCase): def test_cpu(self): - self.test_gaussian_random(place=core.CPUPlace()) + self.gaussian_random_test(place=core.CPUPlace()) def test_gpu(self): - self.test_gaussian_random(place=core.GPUPlace(0)) + if core.is_compile_gpu(): + self.gaussian_random_test(place=core.GPUPlace(0)) - def test_gaussian_random(self, place): + def gaussian_random_test(self, place): scope = core.Scope() scope.new_var("Out").get_tensor() @@ -27,8 +28,8 @@ class GaussianRandomTest(unittest.TestCase): context = core.DeviceContext.create(place) op.run(scope, context) tensor = numpy.array(scope.find_var("Out").get_tensor()) - self.assertAlmostEqual(numpy.mean(tensor), .0, places=3) - self.assertAlmostEqual(numpy.std(tensor), 1., places=3) + self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1) + self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1) if __name__ == '__main__': From c957445c72fd8f2c0354d8b430ef37f47ac3bc73 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 9 Aug 2017 17:51:21 +0800 Subject: [PATCH 47/51] A better error message for gradient checker * Give which parameter, which element are wrong. And what max_diff is. --- paddle/framework/pybind.cc | 9 +++- .../v2/framework/tests/gradient_checker.py | 41 +++++++++++-------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/paddle/framework/pybind.cc b/paddle/framework/pybind.cc index 915ffb1c00..9139a496ec 100644 --- a/paddle/framework/pybind.cc +++ b/paddle/framework/pybind.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/operators/net_op.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" +#include "paddle/string/to_string.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -205,9 +206,13 @@ All parameter, weight, gradient are variables in Paddle. 
}); // clang-format on - py::class_(m, "GPUPlace").def(py::init()); + py::class_(m, "GPUPlace") + .def(py::init()) + .def("__str__", string::to_string); - py::class_(m, "CPUPlace").def(py::init<>()); + py::class_(m, "CPUPlace") + .def(py::init<>()) + .def("__str__", string::to_string); py::class_> operator_base( m, "Operator"); diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py index b73c4869d1..7c4eda5f30 100644 --- a/python/paddle/v2/framework/tests/gradient_checker.py +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -92,15 +92,26 @@ def get_numeric_gradient(op, class GradientChecker(unittest.TestCase): - def __is_close(self, numeric_grads, scope, max_relative_error): + def __is_close(self, numeric_grads, scope, max_relative_error, msg_prefix): for name in numeric_grads: - op_grad = numpy.array( - scope.find_var(grad_var_name(name)).get_tensor()) - is_close = numpy.allclose( - numeric_grads[name], op_grad, rtol=max_relative_error, atol=100) - if not is_close: - return False - return True + b = numpy.array(scope.find_var(grad_var_name(name)).get_tensor()) + a = numeric_grads[name] + + abs_a = numpy.abs(a) + # if abs_a is nearly zero, then use abs error for a, not relative + # error. + abs_a[abs_a < 1e-3] = 1 + + diff_mat = numpy.abs(a - b) / abs_a + max_diff = numpy.max(diff_mat) + + def err_msg(): + offset = numpy.argmax(diff_mat > max_relative_error) + return "%s Variable %s max gradient diff %f over limit %f, the first " \ + "error element is %d" % ( + msg_prefix, name, max_diff, max_relative_error, offset) + + self.assertLessEqual(max_diff, max_relative_error, err_msg()) def check_grad(self, forward_op, @@ -145,7 +156,8 @@ class GradientChecker(unittest.TestCase): # get numeric gradient for check_name in inputs_to_check: numeric_grad[check_name] = \ - get_numeric_gradient(forward_op, input_vars, output_name, check_name) + get_numeric_gradient(forward_op, input_vars, output_name, + check_name) # get operator gradient according to different device for place in places: @@ -187,15 +199,8 @@ class GradientChecker(unittest.TestCase): backward_op.infer_shape(scope) backward_op.run(scope, ctx) - if isinstance(place, core.CPUPlace): - msg = "CPU kernel gradient is not close to numeric gradient" - else: - if isinstance(place, core.GPUPlace): - msg = "GPU kernel gradient is not close to numeric gradient" - else: - raise ValueError("unknown place " + type(place)) - self.assertTrue( - self.__is_close(numeric_grad, scope, max_relative_error), msg) + self.__is_close(numeric_grad, scope, max_relative_error, + "Gradient Check On %s" % str(place)) if __name__ == '__main__': From f0a85b08053440b9a49346f6d07cc106472c5c33 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 9 Aug 2017 18:03:39 +0800 Subject: [PATCH 48/51] Rename __is_close -> assert_is_close() --- python/paddle/v2/framework/tests/gradient_checker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py index 7c4eda5f30..aacc5e88fe 100644 --- a/python/paddle/v2/framework/tests/gradient_checker.py +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -92,7 +92,8 @@ def get_numeric_gradient(op, class GradientChecker(unittest.TestCase): - def __is_close(self, numeric_grads, scope, max_relative_error, msg_prefix): + def assert_is_close(self, numeric_grads, scope, max_relative_error, + msg_prefix): for name in numeric_grads: b = 
numpy.array(scope.find_var(grad_var_name(name)).get_tensor()) a = numeric_grads[name] @@ -199,8 +200,8 @@ class GradientChecker(unittest.TestCase): backward_op.infer_shape(scope) backward_op.run(scope, ctx) - self.__is_close(numeric_grad, scope, max_relative_error, - "Gradient Check On %s" % str(place)) + self.assert_is_close(numeric_grad, scope, max_relative_error, + "Gradient Check On %s" % str(place)) if __name__ == '__main__': From 840d0c74025306985a814c1480851f69923b580a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 9 Aug 2017 18:11:21 +0800 Subject: [PATCH 49/51] Remove unnecessary C++ operator test They are tested in Python --- paddle/operators/CMakeLists.txt | 3 --- paddle/operators/add_op_test.cc | 28 ---------------------------- paddle/operators/mean_op_test.cc | 25 ------------------------- paddle/operators/sgd_op_test.cc | 22 ---------------------- 4 files changed, 78 deletions(-) delete mode 100644 paddle/operators/add_op_test.cc delete mode 100644 paddle/operators/mean_op_test.cc delete mode 100644 paddle/operators/sgd_op_test.cc diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 9e4026d1c6..af22229978 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -45,10 +45,8 @@ cc_library(net_op SRCS net_op.cc DEPS op_registry) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) op_library(add_op SRCS add_op.cc add_op.cu) -cc_test(add_op_test SRCS add_op_test.cc DEPS add_op) op_library(mean_op SRCS mean_op.cc mean_op.cu) -cc_test(mean_op_test SRCS mean_op_test.cc DEPS mean_op) op_library(mul_op SRCS mul_op.cc mul_op.cu) op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc) @@ -59,7 +57,6 @@ op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu) op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu) op_library(sgd_op SRCS sgd_op.cc sgd_op.cu) -cc_test(sgd_op_test SRCS sgd_op_test.cc DEPS sgd_op) op_library(fc_op SRCS fc_op.cc diff --git a/paddle/operators/add_op_test.cc b/paddle/operators/add_op_test.cc deleted file mode 100644 index bf529defb2..0000000000 --- a/paddle/operators/add_op_test.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#define private public -#include "paddle/framework/op_registry.h" - -USE_OP(add_two); - -TEST(AddOp, GetOpProto) { - auto& protos = paddle::framework::OpRegistry::protos(); - auto it = protos.find("add_two"); - ASSERT_NE(it, protos.end()); - auto& op_creators = paddle::framework::OpRegistry::op_creators(); - auto it1 = op_creators.find("add_two_grad"); - ASSERT_NE(it1, op_creators.end()); -} diff --git a/paddle/operators/mean_op_test.cc b/paddle/operators/mean_op_test.cc deleted file mode 100644 index 375dcd50e1..0000000000 --- a/paddle/operators/mean_op_test.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
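[Editor's note: the tolerance rule that assert_is_close applies in gradient_checker.py above, restated as a small self-contained sketch. It is rendered in C++ only to keep one language for the notes in this series; MaxRelativeDiff and the sample data are hypothetical. Elements whose reference value is nearly zero fall back to absolute error, so a tiny denominator cannot inflate the relative diff.]

#include <algorithm>
#include <cmath>
#include <cstdio>

// Mirrors diff_mat = |a - b| / |a|, with |a| < 1e-3 replaced by 1
// (numeric gradient a as reference, operator gradient b as candidate).
double MaxRelativeDiff(const double* a, const double* b, int n) {
  double worst = 0.0;
  for (int i = 0; i < n; ++i) {
    const double denom = std::abs(a[i]) < 1e-3 ? 1.0 : std::abs(a[i]);
    worst = std::max(worst, std::abs(a[i] - b[i]) / denom);
  }
  return worst;
}

int main() {
  const double numeric[] = {0.5, 1e-6};  // second slot is nearly zero
  const double actual[] = {0.501, 2e-6};
  std::printf("max diff = %g\n", MaxRelativeDiff(numeric, actual, 2));
  return 0;  // prints "max diff = 0.002"
}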
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include - -USE_OP(mean); - -TEST(MeanOp, GetOpProto) { - auto& protos = paddle::framework::OpRegistry::protos(); - auto it = protos.find("mean"); - ASSERT_NE(it, protos.end()); -} diff --git a/paddle/operators/sgd_op_test.cc b/paddle/operators/sgd_op_test.cc deleted file mode 100644 index 75137259f5..0000000000 --- a/paddle/operators/sgd_op_test.cc +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -USE_OP(sgd); -TEST(SGDOp, GetOpProto) { - auto& protos = paddle::framework::OpRegistry::protos(); - auto it = protos.find("sgd"); - ASSERT_NE(it, protos.end()); -} From f702e7977ddf571c7d23b8a3b26dc50e4731857b Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 9 Aug 2017 20:01:00 +0800 Subject: [PATCH 50/51] "relaunch ci" --- python/paddle/v2/framework/tests/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 785a589c24..f6850e0651 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -21,7 +21,6 @@ py_test(gradient_checker SRCS gradient_checker.py) py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py) py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py) -py_test(test_op_creation_methods SRCS test_op_creation_methods.py) py_test(test_operator SRCS test_operator.py) From d4e4cebf5f95c0edd1788d81780491cd90e18236 Mon Sep 17 00:00:00 2001 From: zchen0211 Date: Wed, 9 Aug 2017 11:42:40 -0700 Subject: [PATCH 51/51] fix all coding-style problems --- paddle/operators/gather.h | 20 +++----------------- paddle/operators/gather_test.cc | 2 -- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/paddle/operators/gather.h b/paddle/operators/gather.h index 8b02156545..0c73717d38 100644 --- a/paddle/operators/gather.h +++ b/paddle/operators/gather.h @@ -20,13 +20,10 @@ limitations under the License.
*/ #include "paddle/framework/tensor.h" #include "paddle/platform/place.h" -using paddle::framework::Tensor; -using paddle::framework::DDim; - namespace paddle { namespace operators { -/* Implementation of CPU copy */ +// Implementation of CPU copy template void CPUGather(const T* params, const int* indices, const int slice_size, const int index_size, T* output) { @@ -34,15 +31,11 @@ void CPUGather(const T* params, const int* indices, const int slice_size, for (size_t i = 0; i < index_size; ++i) { int index_ = indices[i]; - // copy src[index_] to output[i] memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes); } } -/* Implementation of GPU copy: - I suppose the GPUDevice& d, contains gpu_id and thread_id - d = cuda_stream(gpu_id_, stream_id_); -*/ +// Implementation of GPU copy: template void GPUGather(const T* src, const int* index, const int slice_size, const int index_size, T* output); @@ -62,7 +55,7 @@ void Gather(const platform::Place& place, const paddle::framework::Tensor* src, int index_size = index->dims()[0]; auto src_dims = src->dims(); - DDim output_dims(src_dims); + paddle::framework::DDim output_dims(src_dims); output_dims[0] = index_size; // slice size @@ -73,13 +66,6 @@ void Gather(const platform::Place& place, const paddle::framework::Tensor* src, if (platform::is_cpu_place(place)) { CPUGather(src->data(), index->data(), slice_size, index_size, output->data()); - } else { - // init for GPU - // output_arr = output->mutable_data(output_dims, platform::GPUPlace()); - // how to specialize device?? - // GPUGather( - // d, src->data(), index->data(), slice_size, - // new_tensor->mutable_data()); } } diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc index 5d84b7b5f3..5de748ec46 100644 --- a/paddle/operators/gather_test.cc +++ b/paddle/operators/gather_test.cc @@ -29,7 +29,6 @@ TEST(Gather, GatherData) { Tensor* src = new Tensor(); Tensor* index = new Tensor(); Tensor* output = new Tensor(); - // src.Resize(make_ddim({3, 4})); int* p_src = nullptr; int* p_index = nullptr; @@ -40,7 +39,6 @@ TEST(Gather, GatherData) { p_index[0] = 1; p_index[1] = 0; - // gather int* p_output = output->mutable_data(make_ddim({2, 4}), CPUPlace()); Gather(CPUPlace(), src, index, output);