From 48dea84bf03971fafeb59eccf08d3237dc209690 Mon Sep 17 00:00:00 2001
From: dongzhihong
Date: Wed, 6 Sep 2017 21:12:27 -0700
Subject: [PATCH 01/81] "nccl multigpu init"

---
 paddle/operators/nccl/nccl_gpu_common.h | 39 ++++++++++++++++++++
 paddle/operators/nccl/nccl_ops.cc       | 48 +++++++++++++++++++++++++
 paddle/operators/nccl/nccl_ops.h        |  7 ++++
 3 files changed, 94 insertions(+)
 create mode 100644 paddle/operators/nccl/nccl_gpu_common.h
 create mode 100644 paddle/operators/nccl/nccl_ops.cc
 create mode 100644 paddle/operators/nccl/nccl_ops.h

diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
new file mode 100644
index 0000000000..017492a0d8
--- /dev/null
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -0,0 +1,39 @@
+#pragma once
+#include <nccl.h>
+
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace platform {
+
+class NCCLManager {
+ public:
+  static NCCLManager* Get() {
+    static NCCLManager m;
+    return &m;
+  }
+
+  NCCLManager() { _comms.resize(_gpu_worlds.size()); }
+  ~NCCLManager() {}
+
+ private:
+  // clang-format off
+  std::vector<ncclComm_t> _comms;
+  std::vector<int>        _gpu_worlds;
+  // clang-format on
+};
+
+class NCCLContext : public DeviceContext {
+ public:
+  explicit NCCLContext(GPUPlace place);
+  virtual ~NCCLContext();
+
+ private:
+  // clang-format off
+  std::vector<int>          _gpu_ids;
+  std::vector<cudaStream_t> _streams;
+  int root_gpu;
+  // clang-format on
+};
+}
+}

diff --git a/paddle/operators/nccl/nccl_ops.cc b/paddle/operators/nccl/nccl_ops.cc
new file mode 100644
index 0000000000..a4bd8b9c0f
--- /dev/null
+++ b/paddle/operators/nccl/nccl_ops.cc
@@ -0,0 +1,48 @@
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+
+namespace paddle {
+namespace operators {
+
+// AllreduceOp
+class NCCLAllreduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // allreduce do nothing in infershape
+  void InferShape(const framework::InferShapeContext &ctx) const override {}
+};
+
+template <typename T>
+class NCCLAllreduceOp : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *ctx = static_cast<const platform::CUDADeviceContext *>(
+        context.device_context());
+    // auto *comm = ;
+    // auto *src = ;
+    // ncclAllReduce(src, dest, )
+  }
+};
+
+// BcastSendOp
+template <typename T>
+class NCCLBroadcastSendOp final : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {}
+};
+
+// BcastRecvOp
+template <typename T>
+class NCCLBroadcastRecvOp final : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {}
+};
+}
+}

diff --git a/paddle/operators/nccl/nccl_ops.h b/paddle/operators/nccl/nccl_ops.h
new file mode 100644
index 0000000000..0d78c60639
--- /dev/null
+++ b/paddle/operators/nccl/nccl_ops.h
@@ -0,0 +1,7 @@
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+
+namespace paddle {
+namespace operators {}
+}
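The NCCLManager stub above is a thin wrapper around NCCL's single-process, multi-GPU initialization. For reference, a minimal standalone sketch of that flow using only public NCCL/CUDA runtime APIs (the function name, device enumeration, and cleanup order here are illustrative assumptions, not part of the patch):

    #include <cuda_runtime.h>
    #include <nccl.h>
    #include <vector>

    void InitAndDestroyComms() {
      int ndev = 0;
      cudaGetDeviceCount(&ndev);  // one communicator per visible GPU
      if (ndev == 0) return;
      std::vector<int> devs(ndev);
      for (int i = 0; i < ndev; ++i) devs[i] = i;
      std::vector<ncclComm_t> comms(ndev);
      // Collective initialization: one call creates all communicators.
      ncclCommInitAll(comms.data(), ndev, devs.data());
      // Communicators must be destroyed before process exit.
      for (auto& c : comms) ncclCommDestroy(c);
    }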
From 408e21af92ec93b15207da557b1844733eee420a Mon Sep 17 00:00:00 2001
From: dongzhihong
Date: Wed, 20 Sep 2017 16:23:35 -0700
Subject: [PATCH 02/81] "remove clang format detect"

---
 paddle/operators/nccl/nccl_gpu_common.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index 017492a0d8..55e7d8db66 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -17,10 +17,8 @@ class NCCLManager {
   ~NCCLManager() {}

  private:
-  // clang-format off
   std::vector<ncclComm_t> _comms;
   std::vector<int>        _gpu_worlds;
-  // clang-format on
 };

 class NCCLContext : public DeviceContext {
@@ -29,11 +27,9 @@ class NCCLContext : public DeviceContext {
   virtual ~NCCLContext();

  private:
-  // clang-format off
   std::vector<int>          _gpu_ids;
   std::vector<cudaStream_t> _streams;
   int root_gpu;
-  // clang-format on
 };
 }
 }

From 901b041196f006cd1fc4775a87849e6e716b6c62 Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Wed, 11 Oct 2017 23:09:45 +0800
Subject: [PATCH 03/81] Add seq_expand op

1. Add unitest
2. Add SeqExpandOpKernel
---
 paddle/operators/seq_expand_op.cc              | 125 ++++++++++++++++++
 paddle/operators/seq_expand_op.cu              |  23 ++++
 paddle/operators/seq_expand_op.h               |  83 ++++++++++++
 .../v2/framework/tests/test_seq_expand.py      |  61 +++++++++
 4 files changed, 292 insertions(+)
 create mode 100644 paddle/operators/seq_expand_op.cc
 create mode 100644 paddle/operators/seq_expand_op.cu
 create mode 100644 paddle/operators/seq_expand_op.h
 create mode 100644 python/paddle/v2/framework/tests/test_seq_expand.py

diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc
new file mode 100644
index 0000000000..894ba3f6b7
--- /dev/null
+++ b/paddle/operators/seq_expand_op.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/seq_expand_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class SeqExpandOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SeqExpandOp should not be null.");
+    int repeat = ctx->Attrs().Get<int>("repeat");
+    DDim out_dim;
+    if (repeat == 0) {
+      PADDLE_ENFORCE(
+          ctx->HasInput("Y"),
+          "Input(Y) of SeqExpandOp should not be null while repeat == 0.");
+      out_dim = ctx->GetInputDim("Y");
+      ctx->ShareLoD("Y", "Out");
+    } else {
+      out_dim = ctx->GetInputDim("X");
+      out_dim[0] = out_dim[0] * repeat;
+      ctx->SetOutputDim("Out", y_dim);
+    }
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of PadOp should not be null.");
+    ctx->SetOutputDim("Out", out_dim);
+  }
+};
+
+class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SeqExpandOpMaker(framework::OpProto* proto,
+                   framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // TODO(wanghaoshuang): Add more comments
+    AddInput("X", "The input('X') of seq_expand op.");
+    AddInput("Y", "The reference input('Y') of seq_expand op.");
+    AddOutput("Out", "The output of seq_expand op.");
+    AddAttr<int>("repeat", "repeat times").SetDefault(0);
+    AddComment(R"DOC(
+As an example:
+
+Given:
+
+X = [1, 2 , 3]
+
+and
+
+repeat = 2
+
+
+then we get
+
+Out.data = [1, 1, 2, 2, 3, 3]
+Out.lod = [[0, 2, 4, 6]]
+
+)DOC");
+  }
+};
+
+class SeqExpandOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+class SeqExpandOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto* bind = new framework::OpDescBind();
+    bind->SetInput("X", Input("X"));
+    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    bind->SetAttrMap(Attrs());
+    bind->SetType("seq_expand_grad");
+    return std::unique_ptr<framework::OpDescBind>(bind);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(seq_expand, ops::SeqExpandOp, ops::SeqExpandOpMaker,
+                  ops::SeqExpandOpGradMaker);
+REGISTER_OPERATOR(seq_expand_grad, ops::SeqExpandOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    seq_expand, ops::SeqExpandKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    seq_expand_grad,
+    ops::SeqExpandGradKernel<paddle::platform::CPUPlace, float>);

diff --git a/paddle/operators/seq_expand_op.cu b/paddle/operators/seq_expand_op.cu
new file mode 100644
index 0000000000..f1e4b82a76
--- /dev/null
+++ b/paddle/operators/seq_expand_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/seq_expand_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    seq_expand, ops::SeqExpandKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    seq_expand_grad,
+    ops::SeqExpandGradKernel<paddle::platform::GPUPlace, float>);

diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h
new file mode 100644
index 0000000000..80076dc35f
--- /dev/null
+++ b/paddle/operators/seq_expand_op.h
@@ -0,0 +1,83 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "hl_cuda.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using LoD = paddle::framework::LoD;
+
+template <typename Place, typename T>
+class SeqExpandKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    const T* x_data = x->data<T>();
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    size_t repeat = static_cast<size_t>(context.Attr<int>("repeat"));
+
+    if (repeat != 0) {
+      if (x->lod().size() == 0) {
+        std::vector<size_t> level0(x->dims()[0]);
+        for (size_t i = 0; i <= x->dims()[0]; i++) {
+          level0.push_back(i * repeat);
+        }
+        const LoD out_lod;
+        out_lod.push_back(level0);
+        out->set_lod(out_lod);
+      }
+    }
+    auto out_dim = out->dims();
+    size_t element_len = framework::product(out_dim) / out_dim[0];
+    std::vector<int> cpy_map(out_dim[0]);
+    if (x->lod().size() == 0) {
+      auto lod = out->lod();
+      for (int i = 0; i < lod.size() - 1; ++i) {
+        for (int j = lod[0][i]; i < lod[0][i + 1]; ++j) {
+          cpy_map[j] = i;
+        }
+      }
+    }
+    if (paddle::platform::CPUPlace() == Place) {
+      for (int i = 0; i < out_dim[0]; ++i) {
+        memcpy(out_data + element_len * i, x_data + element_len * cpy_map[i],
+               sizeof(T) * element_len);
+      }
+    } else {
+      for (int i = 0; i < out_dim[0]; ++i) {
+        hl_memcpy(out_data + element_len * i, x_data + element_len * cpy_map[i],
+                  sizeof(T) * element_len);
+      }
+    }
+  }
+};
+
+template <typename Place, typename T>
+class SeqExpandGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    // auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
+    // d_x->mutable_data<T>(context.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
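The copy map that SeqExpandKernel builds from the level-0 LoD can be exercised in isolation. A minimal sketch of the intended mapping (BuildCopyMap is a hypothetical helper, not part of the patch; it assumes a single level-0 LoD of row offsets): output row j copies input row i whenever lod[i] <= j < lod[i+1].

    #include <vector>

    // Returns, for each output row, the index of the input row it copies.
    std::vector<int> BuildCopyMap(const std::vector<size_t>& lod) {
      std::vector<int> cpy_map(lod.back());
      for (size_t i = 0; i + 1 < lod.size(); ++i) {
        for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
          cpy_map[j] = static_cast<int>(i);
        }
      }
      return cpy_map;
    }

    // e.g. lod = {0, 2, 4, 6} (repeat == 2 on 3 rows) gives
    // cpy_map = {0, 0, 1, 1, 2, 2}, matching Out.data = [1, 1, 2, 2, 3, 3].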
diff --git a/python/paddle/v2/framework/tests/test_seq_expand.py b/python/paddle/v2/framework/tests/test_seq_expand.py
new file mode 100644
index 0000000000..4608d3c3bd
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_seq_expand.py
@@ -0,0 +1,61 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSeqExpand(OpTest):
+    #class TestSeqExpand():
+    def set_data(self):
+        self.op_type = 'seq_expand'
+        x = np.random.uniform(0.1, 1, [3, 2, 2]).astype('float32')
+        y = np.zeros((6, 2, 2)).astype('float32')
+        lod = [[0, 2, 3, 6]]
+        print "x = %s" % x
+        self.inputs = {'X': x, 'Y': (y, lod)}
+        self.repeat = None
+
+    def compute(self):
+        x = self.inputs['X']
+        cpy_map = {}
+        lod = []
+        out_shape = []
+        if self.repeat:
+            level0 = []
+            for i in range(x.shape[0] + 1):
+                level0.append(i * self.repeat)
+            lod.append(level0)
+
+            for i in x.shape:
+                out_shape.append(i)
+            out_shape[0] = out_shape[0] * self.repeat
+        else:
+            y, lod = self.inputs['Y']
+            out_shape = y.shape
+        out = np.zeros(out_shape).astype('float32')
+
+        start = 0
+
+        for i in range(len(lod[0]) - 1):
+            for j in range(lod[0][i], lod[0][i + 1]):
+                cpy_map[j] = i
+        print "cpy_map = %s" % cpy_map
+        for i in range(len(out)):
+            out[i] = x[cpy_map[i]]
+
+        print "out = %s" % out
+        self.outputs = {'Out': (out, lod)}
+
+    def setUp(self):
+        self.set_data()
+        self.compute()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+if __name__ == '__main__':
+    unittest.main()
+# TestSeqExpand().setUp()

From acd1aaea49e749a8d402bd6f744f2ca5f3de6020 Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Thu, 12 Oct 2017 00:21:41 +0800
Subject: [PATCH 04/81] fix issues

---
 paddle/operators/seq_expand_op.cc |  3 +--
 paddle/operators/seq_expand_op.h  | 10 +++++-----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc
index 894ba3f6b7..63b17a10f5 100644
--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/seq_expand_op.cc
@@ -28,7 +28,7 @@ class SeqExpandOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of SeqExpandOp should not be null.");
     int repeat = ctx->Attrs().Get<int>("repeat");
-    DDim out_dim;
+    framework::DDim out_dim;
     if (repeat == 0) {
       PADDLE_ENFORCE(
           ctx->HasInput("Y"),
@@ -38,7 +38,6 @@ class SeqExpandOp : public framework::OperatorWithKernel {
     } else {
       out_dim = ctx->GetInputDim("X");
       out_dim[0] = out_dim[0] * repeat;
-      ctx->SetOutputDim("Out", y_dim);
     }
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of PadOp should not be null.");

diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h
index 80076dc35f..0c399fe196 100644
--- a/paddle/operators/seq_expand_op.h
+++ b/paddle/operators/seq_expand_op.h
@@ -21,7 +21,6 @@ namespace paddle {
 namespace operators {

 using LoDTensor = framework::LoDTensor;
-using LoD = paddle::framework::LoD;

 template <typename Place, typename T>
 class SeqExpandKernel : public framework::OpKernel {
@@ -35,11 +34,11 @@ class SeqExpandKernel : public framework::OpKernel {

     if (repeat != 0) {
       if (x->lod().size() == 0) {
-        std::vector<size_t> level0(x->dims()[0]);
+        std::vector<size_t> level0;
         for (size_t i = 0; i <= x->dims()[0]; i++) {
           level0.push_back(i * repeat);
         }
-        const LoD out_lod;
+        framework::LoD out_lod;
         out_lod.push_back(level0);
         out->set_lod(out_lod);
       }
@@ -55,14 +54,15 @@ class SeqExpandKernel : public framework::OpKernel {
       }
     }
-    if (paddle::platform::CPUPlace() == Place) {
+    if (platform::is_cpu_place(context.GetPlace())) {
       for (int i = 0; i < out_dim[0]; ++i) {
         memcpy(out_data + element_len * i, x_data + element_len * cpy_map[i],
                sizeof(T) * element_len);
       }
     } else {
       for (int i = 0; i < out_dim[0]; ++i) {
-        hl_memcpy(out_data + element_len * i, x_data + element_len * cpy_map[i],
+        hl_memcpy(out_data + element_len * i,
+                  const_cast<T*>(x_data) + element_len * cpy_map[i],
                   sizeof(T) * element_len);
       }
     }

From 0fa34db7597e5f31c152bc6327df9a5ea4247b40 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Thu, 12 Oct 2017 04:24:26 +0000
Subject: [PATCH 05/81] nccl init

---
 paddle/operators/nccl/nccl_gpu_common.cc |  9 +++
 paddle/operators/nccl/nccl_gpu_common.h  | 53 +++++++++++++-----
 paddle/operators/nccl/nccl_ops.cc        | 70 ++++++++++++++++++----
 paddle/operators/nccl/nccl_ops.h         | 55 ++++++++++++++++-
 4 files changed, 161 insertions(+), 26 deletions(-)
 create mode 100644 paddle/operators/nccl/nccl_gpu_common.cc

diff --git a/paddle/operators/nccl/nccl_gpu_common.cc b/paddle/operators/nccl/nccl_gpu_common.cc
new file mode 100644
index 0000000000..0144d93969
--- /dev/null
+++ b/paddle/operators/nccl/nccl_gpu_common.cc
@@ -0,0 +1,9 @@
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+
+namespace paddle {
+namespace platform {
+
+
+
+}  // namespace operators
+}  // namespace paddle

diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index 017492a0d8..cace878079 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -1,11 +1,31 @@
 #pragma once
 #include <nccl.h>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <vector>

 #include "paddle/platform/device_context.h"

 namespace paddle {
 namespace platform {
+
+// class NCCLContext : public DeviceContext {
+//  public:
+//   explicit NCCLContext(GPUPlace place);
+//   virtual ~NCCLContext();

+//  private:
+//   std::vector<int> gpu_ids_;
+//   std::vector<cudaStream_t> streams_;
+// };
+
+
+class Communicator;
+
 class NCCLManager {
  public:
   static NCCLManager* Get() {
@@ -13,23 +33,28 @@ class NCCLManager {
     return &m;
   }

-  NCCLManager() { _comms.resize(_gpu_worlds.size()); }
+  NCCLManager() {
+  }
   ~NCCLManager() {}

+  // for each card only have one communicator
+  Communicator* GetCommunicator() const;
+
  private:
-  std::vector<ncclComm_t> _comms;
-  std::vector<int>        _gpu_worlds;
+  struct Communicator {
+    std::vector<ncclComm_t> comms_;
+    std::vector<cudaStream_t*> streams_;  // do not own
+    std::vector<cudaEvent_t> events_;
+    int root_gpu;
+  };
+
+  // the gpu id list available. Note that only support
+  // whole world communication.
+  std::vector<int> _gpu_worlds;
+
+  // communicator list
+  std::unordered_map<std::string, Communicator*> comms_;
 };

-class NCCLContext : public DeviceContext {
- public:
-  explicit NCCLContext(GPUPlace place);
-  virtual ~NCCLContext();
-
- private:
-  std::vector<int>          _gpu_ids;
-  std::vector<cudaStream_t> _streams;
-  int root_gpu;
-};
-}
-}
+}  // namespace operators
+}  // namespace paddle

diff --git a/paddle/operators/nccl/nccl_ops.cc b/paddle/operators/nccl/nccl_ops.cc
index a4bd8b9c0f..4b7bfa7234 100644
--- a/paddle/operators/nccl/nccl_ops.cc
+++ b/paddle/operators/nccl/nccl_ops.cc
@@ -1,17 +1,28 @@
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/operators/nccl/nccl_ops.h"

 namespace paddle {
 namespace operators {

 // AllreduceOp
-class NCCLAllreduceOp : public framework::OperatorWithKernel {
+class NCCLAllReduceOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

  protected:
   // allreduce do nothing in infershape
-  void InferShape(const framework::InferShapeContext &ctx) const override {}
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+        " Input(X) of AllReduce op input should not be NULL");
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto outs = ctx.MultiOutput<Tensor>("Out");
+    PADDLE_ENFORCE(ins.size() == outs.size(), "Input(X) and Output(Out) must have same size");
+    for(size_t i=0; i < ins.size(); ++i) {
+      outs[i]->Resize(ins[i]->dims());
+    }
+    std::string reduction = ctx.Attr<std::string>("reduction");
+    PADDLE_ENFORCE( (reduction == "ncclSum" || reduction == "ncclProd" ||
+                     reduction == "ncclMin" || reduction == "ncclMax"), "invalid reduction!");
+  }
 };

 template <typename T>
@@ -19,30 +30,67 @@ class NCCLAllreduceOp : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
     auto *ctx = static_cast<const platform::CUDADeviceContext *>(
         context.device_context());
-    // auto *comm = ;
-    // auto *src = ;
-    // ncclAllReduce(src, dest, )
   }
 };

 // BcastSendOp
 template <typename T>
-class NCCLBroadcastSendOp final : public framework::OperatorWithKernel {
+class NCCLBcastSendOp final : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {}
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+        " Input(X) of BcastSend op input should not be NULL");
+  }
 };

 // BcastRecvOp
 template <typename T>
-class NCCLBroadcastRecvOp final : public framework::OperatorWithKernel {
+class NCCLBcastRecvOp final : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {}
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+        " Input(X) of BcastRecv op input should not be NULL");
+  }
 };

+
+class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+  NCCLAllReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of AllReduce op");
+    AddOutput("Out", "The output of AllReduce op");
+    AddAttr<std::string>("reduction: {'min', 'max', 'prod', 'sum'}.");
+    AddComment(R"DOC(
+            AllReduce the input tensors.
+ )DOC"); + } }; + +class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker { + NCCLAllReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of BcastSend op"); + AddComment(R"DOC( + BcastSend the tensors. + )DOC"); + } +}; + +class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker { + NCCLAllReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "The output of BcastRecv op"); + AddComment(R"DOC( + BcastRecv the tensors. + )DOC"); + } +}; + } } diff --git a/paddle/operators/nccl/nccl_ops.h b/paddle/operators/nccl/nccl_ops.h index 0d78c60639..3664d2f55c 100644 --- a/paddle/operators/nccl/nccl_ops.h +++ b/paddle/operators/nccl/nccl_ops.h @@ -2,6 +2,59 @@ #include "paddle/framework/op_registry.h" #include "paddle/operators/nccl/nccl_gpu_common.h" +#include + namespace paddle { -namespace operators {} +namespace operators { + + +template +class NCCLTypeWrapper; + +template<> +class NCCLTypeWrapper { + static const ncclDataType_t type = ncclFloat; +}; + +template<> +class NCCLTypeWrapper { + static const ncclDataType_t type = ncclDouble; +}; + + + +template +class NCCLAllReduceKernel : public framework::OpKernel { +public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + std::string reduction = ctx.Attr("reduction"); + ncclRedOp_t op_type; + if (reduction == "ncclSum") { + op_type = ncclSum; + } else if (reduction == "ncclProd") { + op_type = ncclProd; + } else if (reduction == "ncclMin") { + op_type = ncclMin; + } else (reduction == "ncclMax") { + op_type = ncclMax; + } + + auto dev_ctx = ctx.device_context(); + + for( size_t i=0; i < ins.size(); ++i) { + ncclAllReduce(ins[i]->data(), + outs[i]->mutable_data(), + outs[i]->numel() * sizeof(T), + NCCLTypeWrapper::type, + op_type, + comm, + stream); + } + } +}; + + +} } From 51abb6c323aca14722fa79b24dfafc6b23494509 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Thu, 12 Oct 2017 14:55:14 -0700 Subject: [PATCH 06/81] add test --- .../paddle/v2/framework/tests/test_nccl_ops.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 python/paddle/v2/framework/tests/test_nccl_ops.py diff --git a/python/paddle/v2/framework/tests/test_nccl_ops.py b/python/paddle/v2/framework/tests/test_nccl_ops.py new file mode 100644 index 0000000000..128a9ab21a --- /dev/null +++ b/python/paddle/v2/framework/tests/test_nccl_ops.py @@ -0,0 +1,17 @@ +import unittest, os +import numpy as np +import paddle.v2 as paddle +from paddle.v2.framework.op import Operator +import paddle.v2.framework.core as core +from op_test import OpTest, create_op + +gpu_list = os.environ["NV_LIST"] + +if not core.is_compile_gpu() or not gpu_list: + exit(0) + + +class TestNCCLAllReduce(unittest.TestCase): + def __init__(self): + self.op_type = "nnclAllReduce" + self.scope = core.Scope() From d144310415c04966746bfd1b9315fbfa36a81b11 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Fri, 13 Oct 2017 16:03:26 -0700 Subject: [PATCH 07/81] "nccl add interface" --- paddle/operators/CMakeLists.txt | 1 + paddle/operators/nccl/CMakeLists.txt | 8 ++ paddle/operators/nccl/nccl_gpu_common.cc | 49 ++++++++++ paddle/operators/nccl/nccl_gpu_common.h | 92 +++++++++++++++---- paddle/operators/nccl/nccl_gpu_common_test.cc | 23 +++++ paddle/operators/nccl/nccl_ops.cc | 57 
From d144310415c04966746bfd1b9315fbfa36a81b11 Mon Sep 17 00:00:00 2001
From: Dong Zhihong
Date: Fri, 13 Oct 2017 16:03:26 -0700
Subject: [PATCH 07/81] "nccl add interface"

---
 paddle/operators/CMakeLists.txt               |  1 +
 paddle/operators/nccl/CMakeLists.txt          |  8 ++
 paddle/operators/nccl/nccl_gpu_common.cc      | 49 ++++++++++
 paddle/operators/nccl/nccl_gpu_common.h       | 92 +++++++++++++++----
 paddle/operators/nccl/nccl_gpu_common_test.cc | 23 +++++
 paddle/operators/nccl/nccl_ops.cc             | 57 ++++++------
 paddle/operators/nccl/nccl_ops.h              | 58 +++++++-----
 paddle/platform/place.h                       |  1 +
 .../v2/framework/tests/test_nccl_ops.py       | 60 +++++++++++-
 9 files changed, 279 insertions(+), 70 deletions(-)
 create mode 100644 paddle/operators/nccl/CMakeLists.txt
 create mode 100644 paddle/operators/nccl/nccl_gpu_common_test.cc

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index ad941bde2b..702a71d755 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -106,6 +106,7 @@ function(op_library TARGET)
 endfunction()

 add_subdirectory(math)
+add_subdirectory(nccl)

 set(DEPS_OPS
     recurrent_op

diff --git a/paddle/operators/nccl/CMakeLists.txt b/paddle/operators/nccl/CMakeLists.txt
new file mode 100644
index 0000000000..05c27f08fe
--- /dev/null
+++ b/paddle/operators/nccl/CMakeLists.txt
@@ -0,0 +1,8 @@
+if(WITH_GPU)
+  nv_library(nccl_common SRCS nccl_gpu_common DEPS device_context operator)
+  nv_library(nccl_op SRCS nccl_ops.cc DEPS nccl_common)
+else()
+  cc_library(nccl_common SRCS nccl_gpu_common DEPS device_context operator)
+endif()
+
+cc_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common)

diff --git a/paddle/operators/nccl/nccl_gpu_common.cc b/paddle/operators/nccl/nccl_gpu_common.cc
index 0144d93969..492d79ca53 100644
--- a/paddle/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/operators/nccl/nccl_gpu_common.cc
@@ -1,9 +1,58 @@
 #include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/platform/gpu_info.h"

 namespace paddle {
 namespace platform {

+NCCLManager::NCCLManager() {}
+
+NCCLManager::~NCCLManager() {
+  for (auto& p : comm_table) {
+    auto* comm = p.second;
+    auto& gpus_ = comm->gpus_;
+    for (int i = 0; i < gpus_.size(); ++i) {
+      int gid = gpus_[i];
+      platform::SetDeviceId(gid);
+
+      // mapping gid to idx
+      int idx = gid % gpus_.size();
+      // wait finish
+      NCCL_CHECK(
+          cudaStreamWaitEvent(*comm->streams_[idx], comm->events_[idx], 0));
+
+      NCCL_CHECK(cudaEventDestroy(comm->events_[idx]));
+
+      NCCL_CHECK(ncclCommDestroy(comm->comms_[idx]));
+    }
+    delete comm;
+  }
+}
+
+Communicator* NCCLManager::GetCommunicator(const std::vector<int>& gpus) const {
+  std::string key;
+  for (auto& id : gpus) {
+    key += std::to_string(id);
+  }
+  std::sort(key.begin(), key.end());
+
+  std::mutex mu;
+  std::lock_guard<std::mutex> lk(mu);
+  auto* comm = comm_table[key];
+  if (comm == nullptr) {
+    comm = new Communicator(gpus.size());
+    NCCL_CHECK(ncclCommInitAll(comm->comms_.data(), gpus.size(), gpus.data()));
+
+    for (size_t i = 0; i < gpus.size(); ++i) {
+      platform::SetDeviceId(gpus[i]);
+
+      // block wait
+      NCCL_CHECK(cudaEventCreateWithFlags(
+          &events_[i], cudaEventBlockingSync | cudaEventDisableTiming));
+    }
+    comm_table[key] = comm;
+  }
+  return comm;
+}

 }  // namespace operators
 }  // namespace paddle

diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index cace878079..a50490f392 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -1,17 +1,62 @@
 #pragma once
 #include <nccl.h>
+#include <algorithm>
+#include <condition_variable>
 #include <memory>
 #include <mutex>
 #include <string>
 #include <unordered_map>
 #include <vector>

 #include "paddle/platform/device_context.h"

 namespace paddle {
 namespace platform {

+#define NCCL_CHECK(condition)                                               \
+  do {                                                                      \
+    ncclResult_t ret = (condition);                                         \
+    PADDLE_ENFORCE(ret == ncclSuccess, "Error invoking NCCL: ", __FILE__,   \
+                   __LINE__, ncclGetErrorString(ret));                      \
+  } while (0)
+
+class WaitGroup {
+ public:
+  inline void Add(int n) {
+    std::unique_lock<std::mutex> lk(mu_);
+    PADDLE_ENFORCE(n >= 0, "add wait must >=0.");
"add wait must >=0."); + counter_ += n; + } + + inline void Done(int n) { + std::unique_lock lk(mu_); + PADDLE_ENFORCE(n <= counter_, " wait group done unmatch to add."); + counter_ -= n; + if (counter_ == 0) { + cv_.notify_all(); + } + } + + inline void Add() { Add(1); } + + inline void Done() { Done(1); } + + inline void Wait() { + std::unique_lock lk(mu_); + cv_.wait(lk, [&] { return counter_ == 0; }); + } + + inline int GetCount() { + std::unique_lock lk(mu_); + return counter_; + } + + private: + int counter_ = 0; + std::mutex mu_; + std::condition_variable cv_; +}; // class NCCLContext : public DeviceContext { // public: @@ -23,8 +68,26 @@ namespace platform { // std::vector streams_; // }; +// TODO(dzh) : make resources managed unified with framework +struct Communicator { + std::vector comms_; + std::vector streams_; + std::vector events_; + std::vector gpus_; + WaitGroup wg_; + int root_gpu = -1; + // cudaEvent_t root_monitor; + explicit Communicator(const std::vector& gpus) : gpus_(gpus) { + comms_.resize(gpus.size()); + streams_.resize(gpus.size()); + events_.resize(gpus.size()); + } + // Communicator(int num_device): comms_.resize(num_device) {} + + inline int get_root_gpu() const { return root_gpu; } -class Communicator; + inline void set_root_gpu(int id) { root_gpu = id; } +}; class NCCLManager { public: @@ -33,27 +96,20 @@ class NCCLManager { return &m; } - NCCLManager() { - } - ~NCCLManager() {} + NCCLManager(); + + ~NCCLManager(); // for each card only have one communicator - Communicator* GetCommunicator() const; + Communicator* GetCommunicator(const std::vector& gpus) const; private: - struct Communicator { - std::vector comms_; - std::vector streams_; // do not own - std::vector events_; - int root_gpu; - }; - - // the gpu id list available. Note that only support - // whole world communication. - std::vector _gpu_worlds; + // // the gpu id list available. Note that only support + // // whole world communication. 
+  // std::vector<int> _gpu_worlds;

   // communicator list
-  std::unordered_map<std::string, Communicator*> comms_;
+  std::unordered_map<std::string, Communicator*> comm_table;
 };

 }  // namespace operators
 }  // namespace paddle

diff --git a/paddle/operators/nccl/nccl_gpu_common_test.cc b/paddle/operators/nccl/nccl_gpu_common_test.cc
new file mode 100644
index 0000000000..9b46ea31ba
--- /dev/null
+++ b/paddle/operators/nccl/nccl_gpu_common_test.cc
@@ -0,0 +1,23 @@
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <thread>
+#include <vector>
+
+TEST(WaitGroup, wait) {
+  WaitGroup wg;
+  auto run_thread = [](int idx) {
+    wg.Add(1);
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+    wg.Done();
+  };
+
+  std::vector<std::thread> ths;
+  constexpr const int TNUM = 5;
+  for (int i = 0; i < TNUM; ++i) {
+    ths.emplace_back(std::thread(run_thread, i));
+  }
+  wg.Wait();
+}

diff --git a/paddle/operators/nccl/nccl_ops.cc b/paddle/operators/nccl/nccl_ops.cc
index 4b7bfa7234..ccb22f3052 100644
--- a/paddle/operators/nccl/nccl_ops.cc
+++ b/paddle/operators/nccl/nccl_ops.cc
@@ -11,25 +11,20 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
  protected:
   // allreduce do nothing in infershape
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-        " Input(X) of AllReduce op input should not be NULL");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("X"),
+        " Input(X) of AllReduce op input should not be NULL");
     auto ins = ctx.MultiInput<Tensor>("X");
     auto outs = ctx.MultiOutput<Tensor>("Out");
-    PADDLE_ENFORCE(ins.size() == outs.size(), "Input(X) and Output(Out) must have same size");
-    for(size_t i=0; i < ins.size(); ++i) {
+    PADDLE_ENFORCE(ins.size() == outs.size(),
+                   "Input(X) and Output(Out) must have same size");
+    for (size_t i = 0; i < ins.size(); ++i) {
       outs[i]->Resize(ins[i]->dims());
     }
     std::string reduction = ctx.Attr<std::string>("reduction");
-    PADDLE_ENFORCE( (reduction == "ncclSum" || reduction == "ncclProd" ||
-                     reduction == "ncclMin" || reduction == "ncclMax"), "invalid reduction!");
-  }
-};
-
-template <typename T>
-class NCCLAllreduceOp : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *ctx = static_cast<const platform::CUDADeviceContext *>(
-        context.device_context());
+    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
+                    reduction == "ncclMin" || reduction == "ncclMax"),
+                   "invalid reduction!");
   }
 };

@@ -41,8 +36,9 @@ class NCCLBcastSendOp final : public framework::OperatorWithKernel {
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-        " Input(X) of BcastSend op input should not be NULL");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("X"),
+        " Input(X) of BcastSend op input should not be NULL");
   }
 };

@@ -54,18 +50,21 @@ class NCCLBcastRecvOp final : public framework::OperatorWithKernel {
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-        " Input(X) of BcastRecv op input should not be NULL");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        " Input(X) of BcastRecv op input should not be NULL");
   }
 };

 class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
-  NCCLAllReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+  NCCLAllReduceOpMaker(framework::OpProto *proto,
+                       framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of AllReduce op");
AllReduce op"); - AddAttr("reduction: {'min', 'max', 'prod', 'sum'}."); + AddAttr("reduction", + "{'ncclmin', 'ncclmax', 'ncclprod', 'ncclsum'}."); + AddAttr>("gpus", "gpu id lists"); AddComment(R"DOC( AllReduce the input tensors. )DOC"); @@ -73,8 +72,9 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { }; class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker { - NCCLAllReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + NCCLAllReduceOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of BcastSend op"); AddComment(R"DOC( BcastSend the tensors. @@ -83,8 +83,9 @@ class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker { }; class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker { - NCCLAllReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + NCCLAllReduceOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddOutput("Out", "The output of BcastRecv op"); AddComment(R"DOC( BcastRecv the tensors. @@ -92,5 +93,5 @@ class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker { } }; -} -} +} // operators +} // paddle diff --git a/paddle/operators/nccl/nccl_ops.h b/paddle/operators/nccl/nccl_ops.h index 3664d2f55c..7e348a601a 100644 --- a/paddle/operators/nccl/nccl_ops.h +++ b/paddle/operators/nccl/nccl_ops.h @@ -7,29 +7,27 @@ namespace paddle { namespace operators { - -template +template class NCCLTypeWrapper; -template<> +template <> class NCCLTypeWrapper { static const ncclDataType_t type = ncclFloat; }; -template<> +template <> class NCCLTypeWrapper { static const ncclDataType_t type = ncclDouble; }; - - -template +template class NCCLAllReduceKernel : public framework::OpKernel { -public: + public: void Compute(const framework::ExecutionContext& ctx) const override { auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput("Out"); std::string reduction = ctx.Attr("reduction"); + std::vector gpus = ctx.Attr>("gpus"); ncclRedOp_t op_type; if (reduction == "ncclSum") { op_type = ncclSum; @@ -37,24 +35,40 @@ public: op_type = ncclProd; } else if (reduction == "ncclMin") { op_type = ncclMin; - } else (reduction == "ncclMax") { - op_type = ncclMax; - } + } else + (reduction == "ncclMax") { op_type = ncclMax; } + + auto dev_ctx = + static_cast(ctx.device_context()); + + NCCLManager* m = NCCLManager::Get(); + + auto* comm = m->GetCommunicator(gpus); + comm->wg_.Add(1); - auto dev_ctx = ctx.device_context(); + auto* stream = &dev_ctx.stream(); - for( size_t i=0; i < ins.size(); ++i) { - ncclAllReduce(ins[i]->data(), - outs[i]->mutable_data(), - outs[i]->numel() * sizeof(T), - NCCLTypeWrapper::type, - op_type, - comm, - stream); + // device id + int gid = ctx.GetPlace().GetDeviceId(); + int idx = gid % gpus.size(); + comm->streams_[idx] = stream; + + for (size_t i = 0; i < ins.size(); ++i) { + NCCL_CHECK(ncclAllReduce(ins[i]->data(), outs[i]->mutable_data(), + outs[i]->numel() * sizeof(T), + NCCLTypeWrapper::type, op_type, + &comm->comms_[idx], comm->streams_[idx])); + NCCL_CHECK(cudaEventRecord(comm->events_[idx], *comms_->streams_[idx])); + + // wait finish + NCCL_CHECK( + cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0)); } - } -}; + comm->wg_.Done(); + wg.Wait(); + } +}; } } diff --git 
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index 0efc693234..5370360a7d 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -35,6 +35,7 @@ struct GPUPlace {
   GPUPlace() : GPUPlace(0) {}
   explicit GPUPlace(int d) : device(d) {}

+  inline int GetDeviceId() const { return device; }
   // needed for variant equality comparison
   inline bool operator==(const GPUPlace &o) const { return device == o.device; }
   inline bool operator!=(const GPUPlace &o) const { return !(*this == o); }

diff --git a/python/paddle/v2/framework/tests/test_nccl_ops.py b/python/paddle/v2/framework/tests/test_nccl_ops.py
index 128a9ab21a..9bfa4c74d4 100644
--- a/python/paddle/v2/framework/tests/test_nccl_ops.py
+++ b/python/paddle/v2/framework/tests/test_nccl_ops.py
@@ -3,7 +3,7 @@ import numpy as np
 import paddle.v2 as paddle
 from paddle.v2.framework.op import Operator
 import paddle.v2.framework.core as core
-from op_test import OpTest, create_op
+from op_test import OpTest, create_op, set_input

 gpu_list = os.environ["NV_LIST"]

@@ -11,7 +11,63 @@ if not core.is_compile_gpu() or not gpu_list:
     exit(0)


+def allreduce(tensors, num_device):
+    assert (len(tensors) == num_device), "not match of tensor and device"
+    Out = tensors
+    for i in range(1, len(tensors)):
+        Out[0] += Out[i]
+
+    for i in range(1, len(tensors)):
+        Out[i] = Out[0]
+
+    return Out
+
+
 class TestNCCLAllReduce(unittest.TestCase):
     def __init__(self):
         self.op_type = "nnclAllReduce"
-        self.scope = core.Scope()
+
+        self.gpus = [int(g) for g in gpu_list]
+
+        self.scopes = []
+        self.ops = []
+        self.places = []
+
+        self.input_data = []
+        for i in range(len(self.gpus)):
+            input_data.append(np.random.random((32, 32)))
+        self.output_data = allreduce(input_data)
+
+        for i in range(len(self.gpus)):
+            scope = core.Scope()
+            place = core.GPUPlace(self.gpus[i])
+            inputs = {"X": self.input_data[i]}
+            outputs = {"Out": self.output_data[i]}
+            attrs = {"gpus": self.gpus}
+
+            op = create_op(scope, self.op_type, inputs, outputs, attrs)
+            set_input(scope, op, inputs, place)
+
+            self.scopes.append(scope)
+            self.ops.append(op)
+            self.places.append(place)
+
+    def test_output(self):
+        idx = 0
+        for scope, place, op in zip(self.scopes, self.places, self.ops):
+            ctx = core.DeviceContext.create(place)
+            op.run(scope, ctx)
+
+            for out_name, out_dup in Operator.get_op_outputs(self.op.type()):
+                actual = np.array(scope.find_var(out_name).get_tensor())
+                expect = self.output_data[idx]
+
+                idx += 1
+                self.assertTrue(actual, expect), "has diff"
+
+
+if __name__ == "__main__":
+    # usage : export NV_LIST=0,1,2,3 python *.py
+
+    os.environ["NV_LIST"] = ["0,1,2,3"]
+    unittest.main()

From 54d3dbd8c93c7a28fc61a66a363b98150756096b Mon Sep 17 00:00:00 2001
From: Dong Zhihong
Date: Sat, 14 Oct 2017 10:50:50 -0700
Subject: [PATCH 08/81] "add enforce check"

---
 paddle/platform/dynload/CMakeLists.txt    |  2 +-
 paddle/platform/dynload/dynamic_loader.cc | 13 ++++
 paddle/platform/dynload/dynamic_loader.h  |  8 +++
 paddle/platform/dynload/nccl.cc           | 30 ++++++++++
 paddle/platform/dynload/nccl.h            | 72 +++++++++++++++++++++++
 paddle/platform/enforce.h                 | 12 ++++
 6 files changed, 136 insertions(+), 1 deletion(-)
 create mode 100644 paddle/platform/dynload/nccl.cc
 create mode 100644 paddle/platform/dynload/nccl.h

diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
index ceb66f84b6..4c8be33480 100644
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -1,2 +1,2 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
-nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc DEPS dynamic_loader)
+nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc DEPS dynamic_loader)

diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc
index ae9a0a982c..5c2ee2e5fc 100644
--- a/paddle/platform/dynload/dynamic_loader.cc
+++ b/paddle/platform/dynload/dynamic_loader.cc
@@ -35,6 +35,11 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");

 DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");

+DEFINE_string(nccl_dir, "",
+              "Specify path for loading nccl library, such as libcublas, "
+              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
+              "dlopen will search cuda from LD_LIBRARY_PATH");
+
 namespace paddle {
 namespace platform {
 namespace dynload {
@@ -157,6 +162,14 @@ void GetLapackDsoHandle(void** dso_handle) {
 #endif
 }

+void GetNcclDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle);
+#endif
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle

diff --git a/paddle/platform/dynload/dynamic_loader.h b/paddle/platform/dynload/dynamic_loader.h
index a99b05443f..b9483890be 100644
--- a/paddle/platform/dynload/dynamic_loader.h
+++ b/paddle/platform/dynload/dynamic_loader.h
@@ -58,6 +58,14 @@ void GetWarpCTCDsoHandle(void** dso_handle);
  */
 void GetLapackDsoHandle(void** dso_handle);

+/**
+ * @brief load the DSO of NVIDIA nccl
+ *
+ * @param **dso_handle dso handler
+ *
+ */
+void GetNcclDsoHandle(void** dso_handle);
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle

diff --git a/paddle/platform/dynload/nccl.cc b/paddle/platform/dynload/nccl.cc
new file mode 100644
index 0000000000..8f92b8d94d
--- /dev/null
+++ b/paddle/platform/dynload/nccl.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/platform/dynload/nccl.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag nccl_dso_flag;
+void *nccl_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle

diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h
new file mode 100644
index 0000000000..ad050da4ad
--- /dev/null
+++ b/paddle/platform/dynload/nccl.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <dlfcn.h>
+#include <nccl.h>
+#include <mutex>
+#include "paddle/platform/dynload/dynamic_loader.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag nccl_dso_flag;
+extern void* nccl_dso_handle;
+
+#ifdef PADDLE_USE_DSO
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                     \
+  struct DynLoad__##__name {                                       \
+    template <typename... Args>                                    \
+    ncclResult_t operator()(Args... args) {                        \
+      typedef ncclResult_t (*ncclFunc)(Args...);                   \
+      std::call_once(nccl_dso_flag,                                \
+                     paddle::platform::dynload::GetNcclDsoHandle,  \
+                     &nccl_dso_handle);                            \
+      void* p_##__name = dlsym(nccl_dso_handle, #__name);          \
+      return reinterpret_cast<ncclFunc>(p_##__name)(args...);      \
+    }                                                              \
+  };                                                               \
+  extern DynLoad__##__name __name
+#else
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
+  struct DynLoad__##__name {                   \
+    template <typename... Args>                \
+    ncclResult_t operator()(Args... args) {    \
+      return __name(args...);                  \
+    }                                          \
+  };                                           \
+  extern DynLoad__##__name __name
+#endif
+
+#define NCCL_RAND_ROUTINE_EACH(__macro) \
+  __macro(ncclCommInitAll);             \
+  __macro(ncclGetUniqueId);             \
+  __macro(ncclCommInitRank);            \
+  __macro(ncclCommDestroy);             \
+  __macro(ncclCommCount);               \
+  __macro(ncclCommCuDevice);            \
+  __macro(ncclCommUserRank);            \
+  __macro(ncclAllReduce);               \
+  __macro(ncclBcast);                   \
+  __macro(ncclAllGather);               \
+  __macro(ncclReduce);                  \
+  __macro(ncclGetErrorString);
+
+NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle

diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index cd906c3fa9..2f9e7466f1 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
 #include "paddle/platform/dynload/curand.h"
+#include "paddle/platform/dynload/nccl.h"

 #include <cublas_v2.h>
 #include <cudnn.h>
@@ -172,6 +173,17 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
   throw std::runtime_error(err + string::Sprintf(args...));
 }

+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    ncclResult_t stat, const Args&... args) {
+  if (stat == ncclSuccess) {
+    return;
+  } else {
+    throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) +
+                             string::Sprintf(args...));
+  }
+}
+
 #endif  // PADDLE_ONLY_CPU

 template <typename T>
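The DECLARE_DYNAMIC_LOAD_NCCL_WRAP macro in the patch above expands to a functor that resolves each NCCL symbol lazily and exactly once. Reduced to plain dlfcn calls, the underlying pattern is roughly the following sketch (function name, library path, and error handling are illustrative assumptions, not Paddle code):

    #include <dlfcn.h>
    #include <mutex>

    // Open libnccl.so once, then look up symbols from the cached handle.
    void* GetNcclFunc(const char* name) {
      static std::once_flag flag;
      static void* handle = nullptr;
      std::call_once(flag, [] { handle = dlopen("libnccl.so", RTLD_LAZY); });
      return handle ? dlsym(handle, name) : nullptr;
    }

The macro additionally casts the resolved pointer to the exact function type so the call site keeps NCCL's real signature.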
From d8aebaf50c38c88a05728f3bb915da7e767ff496 Mon Sep 17 00:00:00 2001
From: Dong Zhihong
Date: Sun, 15 Oct 2017 13:28:05 -0700
Subject: [PATCH 09/81] "fix enforce error"

---
 paddle/operators/nccl/nccl_gpu_common.cc | 33 +++++++++++++-----------
 paddle/operators/nccl/nccl_gpu_common.h  | 14 +++-------
 paddle/operators/nccl/nccl_ops.h         | 13 +++++-----
 paddle/platform/dynload/nccl.h           |  8 +++---
 paddle/platform/enforce.h                |  2 ++
 5 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/paddle/operators/nccl/nccl_gpu_common.cc b/paddle/operators/nccl/nccl_gpu_common.cc
index 492d79ca53..80cb66300e 100644
--- a/paddle/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/operators/nccl/nccl_gpu_common.cc
@@ -8,27 +8,27 @@ NCCLManager::NCCLManager() {}

 NCCLManager::~NCCLManager() {
   for (auto& p : comm_table) {
-    auto* comm = p.second;
+    auto& comm = p.second;
     auto& gpus_ = comm->gpus_;
-    for (int i = 0; i < gpus_.size(); ++i) {
+    for (size_t i = 0; i < gpus_.size(); ++i) {
       int gid = gpus_[i];
       platform::SetDeviceId(gid);

       // mapping gid to idx
       int idx = gid % gpus_.size();
       // wait finish
-      NCCL_CHECK(
+      PADDLE_ENFORCE(
           cudaStreamWaitEvent(*comm->streams_[idx], comm->events_[idx], 0));

-      NCCL_CHECK(cudaEventDestroy(comm->events_[idx]));
+      PADDLE_ENFORCE(cudaEventDestroy(comm->events_[idx]));

-      NCCL_CHECK(ncclCommDestroy(comm->comms_[idx]));
+      PADDLE_ENFORCE(ncclCommDestroy(comm->comms_[idx]));
     }
-    delete comm;
+    comm.reset(nullptr);
   }
 }

-Communicator* NCCLManager::GetCommunicator(const std::vector<int>& gpus) const {
+Communicator* NCCLManager::GetCommunicator(const std::vector<int>& gpus) {
   std::string key;
   for (auto& id : gpus) {
     key += std::to_string(id);
@@ -37,21 +37,24 @@ Communicator* NCCLManager::GetCommunicator(const std::vector<int>& gpus) {

   std::mutex mu;
   std::lock_guard<std::mutex> lk(mu);
-  auto* comm = comm_table[key];
-  if (comm == nullptr) {
-    comm = new Communicator(gpus.size());
-    NCCL_CHECK(ncclCommInitAll(comm->comms_.data(), gpus.size(), gpus.data()));
+
+  auto it = comm_table.find(key);
+
+  if (it->second == nullptr) {
+    auto* comm = new Communicator(gpus);
+    PADDLE_ENFORCE(
+        ncclCommInitAll(comm->comms_.data(), gpus.size(), gpus.data()));

     for (size_t i = 0; i < gpus.size(); ++i) {
       platform::SetDeviceId(gpus[i]);

       // block wait
-      NCCL_CHECK(cudaEventCreateWithFlags(
-          &events_[i], cudaEventBlockingSync | cudaEventDisableTiming));
+      PADDLE_ENFORCE(cudaEventCreateWithFlags(
+          &comm->events_[i], cudaEventBlockingSync | cudaEventDisableTiming));
     }
-    comm_table[key] = comm;
+    comm_table[key].reset(comm);
   }
-  return comm;
+  return comm_table[key].get();
 }

diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index a50490f392..96b3bb801a 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -1,5 +1,4 @@
 #pragma once
-#include <algorithm>
 #include <condition_variable>
 #include <memory>
@@ -10,17 +9,11 @@
 #include <vector>

 #include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"

 namespace paddle {
 namespace platform {

-#define NCCL_CHECK(condition)                                               \
-  do {                                                                      \
-    ncclResult_t ret = (condition);                                         \
-    PADDLE_ENFORCE(ret == ncclSuccess, "Error invoking NCCL: ", __FILE__,   \
-                   __LINE__, ncclGetErrorString(ret));                      \
-  } while (0)
-
 class WaitGroup {
  public:
   inline void Add(int n) {
@@ -101,7 +94,7 @@ class NCCLManager {
   ~NCCLManager();

   // for each card only have one communicator
-  Communicator* GetCommunicator(const std::vector<int>& gpus) const;
+  Communicator* GetCommunicator(const std::vector<int>& gpus);

  private:
   // // the gpu id list available. Note that only support
@@ -109,7 +102,8 @@ class NCCLManager {
   // std::vector<int> _gpu_worlds;

   // communicator list
-  std::unordered_map<std::string, Communicator*> comm_table;
+  std::unordered_map<std::string, std::unique_ptr<Communicator>>
+      comm_table;
 };

diff --git a/paddle/operators/nccl/nccl_ops.h b/paddle/operators/nccl/nccl_ops.h
index 7e348a601a..894859f6f0 100644
--- a/paddle/operators/nccl/nccl_ops.h
+++ b/paddle/operators/nccl/nccl_ops.h
@@ -54,14 +54,15 @@ class NCCLAllReduceKernel : public framework::OpKernel {
     comm->streams_[idx] = stream;

     for (size_t i = 0; i < ins.size(); ++i) {
-      NCCL_CHECK(ncclAllReduce(ins[i]->data<T>(), outs[i]->mutable_data<T>(),
-                               outs[i]->numel() * sizeof(T),
-                               NCCLTypeWrapper<T>::type, op_type,
-                               &comm->comms_[idx], comm->streams_[idx]));
-      NCCL_CHECK(cudaEventRecord(comm->events_[idx], *comms_->streams_[idx]));
+      PADDLE_ENFORCE(
+          ncclAllReduce(ins[i]->data<T>(), outs[i]->mutable_data<T>(),
+                        outs[i]->numel() * sizeof(T), NCCLTypeWrapper<T>::type,
+                        op_type, &comm->comms_[idx], comm->streams_[idx]));
+      PADDLE_ENFORCE(
+          cudaEventRecord(comm->events_[idx], *comms_->streams_[idx]));

       // wait finish
-      NCCL_CHECK(
+      PADDLE_ENFORCE(
           cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0));
     }

diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h
index ad050da4ad..fbfcec4c98 100644
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
@@ -30,13 +30,13 @@ extern void* nccl_dso_handle;
 #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                     \
   struct DynLoad__##__name {                                       \
     template <typename... Args>                                    \
-    ncclResult_t operator()(Args... args) {                        \
-      typedef ncclResult_t (*ncclFunc)(Args...);                   \
+    auto operator()(Args... args) -> decltype(__name(args...)) {   \
+      using nccl_func = decltype(__name(args...)) (*)(Args...);    \
       std::call_once(nccl_dso_flag,                                \
                      paddle::platform::dynload::GetNcclDsoHandle,  \
                      &nccl_dso_handle);                            \
       void* p_##__name = dlsym(nccl_dso_handle, #__name);          \
-      return reinterpret_cast<ncclFunc>(p_##__name)(args...);      \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);     \
     }                                                              \
   };                                                               \
   extern DynLoad__##__name __name
@@ -65,7 +65,7 @@ extern void* nccl_dso_handle;
   __macro(ncclReduce);                  \
   __macro(ncclGetErrorString);

-NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP);
+NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)

 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle

diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 2f9e7466f1..bfe708748a 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -29,6 +29,8 @@ limitations under the License. */
 #include <cxxabi.h>  // for __cxa_demangle
 #endif

+#include <nccl.h>
+
 #ifdef PADDLE_WITH_CUDA

 #include "paddle/platform/dynload/cublas.h"

From 5bcb63800e602ed2c63c63ee5f82e986f645c960 Mon Sep 17 00:00:00 2001
From: Dong Zhihong
Date: Sun, 15 Oct 2017 13:34:52 -0700
Subject: [PATCH 10/81] "fix common test"

---
 paddle/operators/nccl/nccl_gpu_common.h       | 16 +++++-
 paddle/operators/nccl/nccl_gpu_common_test.cc | 12 +++++++++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index 96b3bb801a..4a375fcc36 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once
 #include <condition_variable>
@@ -106,5 +120,5 @@ class NCCLManager {
       comm_table;
 };

-}  // namespace operators
+}  // namespace platform
 }  // namespace paddle

diff --git a/paddle/operators/nccl/nccl_gpu_common_test.cc b/paddle/operators/nccl/nccl_gpu_common_test.cc
index 9b46ea31ba..6f6a4ac886 100644
--- a/paddle/operators/nccl/nccl_gpu_common_test.cc
+++ b/paddle/operators/nccl/nccl_gpu_common_test.cc
@@ -6,9 +6,12 @@
 #include <chrono>
 #include <thread>
 #include <vector>

+namespace paddle {
+namespace platform {
+
 TEST(WaitGroup, wait) {
   WaitGroup wg;
-  auto run_thread = [](int idx) {
+  auto run_thread = [&wg](int idx) {
     wg.Add(1);
     std::this_thread::sleep_for(std::chrono::seconds(1));
     wg.Done();
@@ -20,4 +23,11 @@ TEST(WaitGroup, wait) {
     ths.emplace_back(std::thread(run_thread, i));
   }
   wg.Wait();
+
+  for (int i = 0; i < TNUM; ++i) {
+    ths[i].join();
+  }
 }
+
+}  // namespace platform
+}  // namespace paddle

From 73883bde2ad6a4fd0338df10da7af7d4b993f1b2 Mon Sep 17 00:00:00 2001
From: Dong Zhihong
Date: Sun, 15 Oct 2017 14:27:22 -0700
Subject: [PATCH 11/81] "fix error"

---
 paddle/operators/nccl/nccl_ops.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/paddle/operators/nccl/nccl_ops.h b/paddle/operators/nccl/nccl_ops.h
index 894859f6f0..f56b89d2ad 100644
--- a/paddle/operators/nccl/nccl_ops.h
+++ b/paddle/operators/nccl/nccl_ops.h
@@ -7,6 +7,8 @@
 namespace paddle {
 namespace operators {

+using framework::Tensor;
+
 template <typename T>
 class NCCLTypeWrapper;

@@ -21,7 +23,7 @@ class NCCLTypeWrapper {
 };

 template <typename T>
-class NCCLAllReduceKernel : public framework::OpKernel {
+class NCCLAllReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto ins = ctx.MultiInput<Tensor>("X");
@@ -35,13 +37,14 @@ class NCCLAllReduceKernel : public framework::OpKernel {
       op_type = ncclProd;
     } else if (reduction == "ncclMin") {
       op_type = ncclMin;
-    } else
-      (reduction == "ncclMax") { op_type = ncclMax; }
+    } else if (reduction == "ncclMax") {
+      op_type = ncclMax;
+    }

     auto dev_ctx =
         static_cast<const platform::CUDADeviceContext&>(ctx.device_context());

-    NCCLManager* m = NCCLManager::Get();
+    platform::NCCLManager* m = platform::NCCLManager::Get();

     auto* comm = m->GetCommunicator(gpus);
     comm->wg_.Add(1);
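The kernels in these patches enqueue the collective, record an event on the communicator's per-GPU stream, and then wait on that event. In plain CUDA runtime calls, the record-then-wait idiom looks like this sketch (stream/event names are illustrative; in the patch the record and wait both target the same per-GPU stream):

    #include <cuda_runtime.h>

    // Record completion of work queued on `producer`, then make `consumer`
    // wait for it without blocking the host thread.
    void WaitOn(cudaStream_t producer, cudaStream_t consumer, cudaEvent_t ev) {
      cudaEventRecord(ev, producer);
      cudaStreamWaitEvent(consumer, ev, 0);
    }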
NCCLManager::~NCCLManager() { int idx = gid % gpus_.size(); // wait finish PADDLE_ENFORCE( - cudaStreamWaitEvent(*comm->streams_[idx], comm->events_[idx], 0)); + cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0)); PADDLE_ENFORCE(cudaEventDestroy(comm->events_[idx])); diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h index 4a375fcc36..5ca6a9e05e 100644 --- a/paddle/operators/nccl/nccl_gpu_common.h +++ b/paddle/operators/nccl/nccl_gpu_common.h @@ -65,20 +65,10 @@ class WaitGroup { std::condition_variable cv_; }; -// class NCCLContext : public DeviceContext { -// public: -// explicit NCCLContext(GPUPlace place); -// virtual ~NCCLContext(); - -// private: -// std::vector gpu_ids_; -// std::vector streams_; -// }; - // TODO(dzh) : make resources managed unified with framework struct Communicator { std::vector comms_; - std::vector streams_; + std::vector streams_; std::vector events_; std::vector gpus_; WaitGroup wg_; diff --git a/paddle/operators/nccl/nccl_ops.cc b/paddle/operators/nccl/nccl_ops.cc index ccb22f3052..f1a83c1e1e 100644 --- a/paddle/operators/nccl/nccl_ops.cc +++ b/paddle/operators/nccl/nccl_ops.cc @@ -1,3 +1,14 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
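The heart of the "fix error" patch above is the malformed branch '} else (reduction == "ncclMax") { op_type = ncclMax; }', which is a syntax error: an else whose body is a bare parenthesized expression, followed by a dangling block. The corrected else-if chain maps each reduction name onto its ncclRedOp_t value. The same dispatch, pulled into a standalone helper for clarity; this is a sketch, not code from the series, and it assumes <nccl.h> and paddle/platform/enforce.h are visible:

// Translate the op's "reduction" attribute into an NCCL reduction op.
static ncclRedOp_t ToNCCLRedOp(const std::string& reduction) {
  if (reduction == "ncclSum") return ncclSum;
  if (reduction == "ncclProd") return ncclProd;
  if (reduction == "ncclMin") return ncclMin;
  if (reduction == "ncclMax") return ncclMax;
  PADDLE_THROW("invalid reduction");  // mirrors the kernel's valid set
  return ncclSum;  // unreachable; silences missing-return warnings
}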
*/ + #include "paddle/operators/nccl/nccl_ops.h" namespace paddle { @@ -9,54 +20,27 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - // allreduce do nothing in infershape - void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL( - ctx.InputVar("X"), - " Input(X) of AllReduce op input should not be NULL"); - auto ins = ctx.MultiInput("X"); - auto outs = ctx.MultiOutput("Out"); - PADDLE_ENFORCE(ins.size() == outs.size(), - "Input(X) and Output(Out) must have same size"); - for (size_t i = 0; i < ins.size(); ++i) { - outs[i]->Resize(ins[i]->dims()); - } - std::string reduction = ctx.Attr("reduction"); - PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || - reduction == "ncclMin" || reduction == "ncclMax"), - "invalid reduction!"); - } -}; + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + " Input(X) of AllReduce op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + " Input(X) of AllReduce op input should not be NULL"); -// BcastSendOp -template -class NCCLBcastSendOp final : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL( - ctx.InputVar("X"), - " Input(X) of BcastSend op input should not be NULL"); - } -}; + auto x_dims = ctx->GetInputsDim("X"); -// BcastRecvOp -template -class NCCLBcastRecvOp final : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; + std::string reduction = ctx->Attrs().Get("reduction"); + PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), + "invalid reduction."); - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL( - ctx.OutputVar("Out"), - " Input(X) of BcastRecv op input should not be NULL"); + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); } }; +// AllreduceOp class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: NCCLAllReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { @@ -71,7 +55,9 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { } }; +// BcastSendOp class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker { + public: NCCLAllReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { @@ -82,7 +68,9 @@ class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker { } }; +// BcastRecvOp class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker { + public: NCCLAllReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { @@ -93,5 +81,9 @@ class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker { } }; -} // operators -} // paddle +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp, + ops::NCCLAllReduceOpMaker); diff --git a/paddle/operators/nccl/nccl_ops.cu b/paddle/operators/nccl/nccl_ops.cu new file mode 100644 index 0000000000..eabe5f1729 --- /dev/null +++ 
b/paddle/operators/nccl/nccl_ops.cu @@ -0,0 +1,16 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/nccl/nccl_ops.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); \ No newline at end of file diff --git a/paddle/operators/nccl/nccl_ops.h b/paddle/operators/nccl/nccl_ops.h index f56b89d2ad..c46fdd7d44 100644 --- a/paddle/operators/nccl/nccl_ops.h +++ b/paddle/operators/nccl/nccl_ops.h @@ -1,3 +1,14 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #pragma once #include "paddle/framework/op_registry.h" #include "paddle/operators/nccl/nccl_gpu_common.h" @@ -14,11 +25,13 @@ class NCCLTypeWrapper; template <> class NCCLTypeWrapper { + public: static const ncclDataType_t type = ncclFloat; }; template <> class NCCLTypeWrapper { + public: static const ncclDataType_t type = ncclDouble; }; @@ -49,10 +62,10 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto* comm = m->GetCommunicator(gpus); comm->wg_.Add(1); - auto* stream = &dev_ctx.stream(); + auto stream = dev_ctx.stream(); // device id - int gid = ctx.GetPlace().GetDeviceId(); + int gid = static_cast(ctx.GetPlace()).GetDeviceId(); int idx = gid % gpus.size(); comm->streams_[idx] = stream; @@ -60,9 +73,8 @@ class NCCLAllReduceKernel : public framework::OpKernel { PADDLE_ENFORCE( ncclAllReduce(ins[i]->data(), outs[i]->mutable_data(), outs[i]->numel() * sizeof(T), NCCLTypeWrapper::type, - op_type, &comm->comms_[idx], comm->streams_[idx])); - PADDLE_ENFORCE( - cudaEventRecord(comm->events_[idx], *comms_->streams_[idx])); + op_type, comm->comms_[idx], comm->streams_[idx])); + PADDLE_ENFORCE(cudaEventRecord(comm->events_[idx], comm->streams_[idx])); // wait finish PADDLE_ENFORCE( @@ -71,8 +83,9 @@ class NCCLAllReduceKernel : public framework::OpKernel { comm->wg_.Done(); - wg.Wait(); + comm->wg_.Wait(); } }; -} -} + +} // namespace operators +} // namespace paddle From 23701ffaf07840013295bb2ec14a484e263cdab9 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 18 Oct 2017 11:32:55 +0800 Subject: [PATCH 13/81] Refine op --- paddle/operators/seq_expand_op.h | 119 +++++++++++----- python/paddle/v2/framework/tests/op_test.py | 4 +- .../v2/framework/tests/test_seq_expand.py | 128 +++++++++++++----- 3 files changed, 185 insertions(+), 66 deletions(-) diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index 
0c399fe196..cd1182c4f0 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/seq_expand_op.h @@ -14,14 +14,62 @@ #pragma once -#include "hl_cuda.h" #include "paddle/framework/op_registry.h" +#include "paddle/memory/memcpy.h" namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; +template +using vector = framework::Vector; + +vector repeat_lod(vector data, vector starts, + vector times, bool is_first) { + vector result; + result.push_back(data[0]); + size_t p = 0, start = 0, end = 0; + if (is_first == true) { + for (size_t i = 0; i < times.size(); ++i) { + result.push_back(data.back() + times[i] * (data[i + 1] - data[i])); + } + } else { + for (size_t i = 0; i < times.size(); ++i) { + while (starts[i] != data[p] && p < data.size()) { + ++p; + } + start = p; + while (starts[i + 1] != data[p] && p < data.size()) { + ++p; + } + end = p + 1; + for (size_t j = 0; j < times[i]; ++j) { + for (size_t index = start; index < end - 1; ++index) { + result.push_back(result.back() + data[index + 1] - data[index]); + } + } + } + } + return result; +} + +template +void repeat_data(const T* src, T* dst, size_t size, vector starts, + vector times, Place place) { + const T* src_p = src; + T* dst_p = dst; + size_t count = 0; + for (size_t i = 0; i < times.size(); ++i) { + count = size * (starts[i + 1] - starts[i]); + for (size_t j = 0; j < times[i]; ++j) { + memory::Copy(place, dst_p, place, src_p, sizeof(T) * count); + dst_p += count; + } + src_p += count; + } +} + template class SeqExpandKernel : public framework::OpKernel { public: @@ -29,43 +77,52 @@ class SeqExpandKernel : public framework::OpKernel { auto* x = context.Input("X"); auto* out = context.Output("Out"); const T* x_data = x->data(); - T* out_data = out->mutable_data(context.GetPlace()); - size_t repeat = static_cast(context.Attr("repeat")); + auto x_dims = x->dims(); + auto x_lod = x->lod(); - if (repeat != 0) { - if (x->lod().size() == 0) { - std::vector level0; - for (size_t i = 0; i <= x->dims()[0]; i++) { - level0.push_back(i * repeat); - } - framework::LoD out_lod; - out_lod.push_back(level0); - out->set_lod(out_lod); - } - } - auto out_dim = out->dims(); - size_t element_len = framework::product(out_dim) / out_dim[0]; - std::vector cpy_map(out_dim[0]); - if (x->lod().size() == 0) { - auto lod = out->lod(); - for (int i = 0; i < lod.size() - 1; ++i) { - for (int j = lod[0][i]; i < lod[0][i + 1]; ++j) { - cpy_map[j] = i; - } + if (x_lod.size() == 0) { + vector level; + for (int i = 0; i < x->dims()[0] + 1; ++i) { + level.push_back(i); } + x_lod.push_back(level); + } else { + x_lod.insert(x_lod.begin(), x_lod[0]); } - if (platform::is_cpu_place(context.GetPlace())) { - for (int i = 0; i < out_dim[0]; ++i) { - memcpy(out_data + element_len * i, x_data + element_len * cpy_map[i], - sizeof(T) * element_len); + + size_t repeat = static_cast(context.Attr("repeat")); + vector repeats; + if (repeat != 0) { + for (int i = 0; i < x_lod[0].size() - 1; ++i) { + repeats.push_back(repeat); } + std::vector dims = framework::vectorize(x->dims()); + dims[0] = dims[0] * repeat; + auto out_dims = framework::make_ddim(dims); + out->Resize(out_dims); } else { - for (int i = 0; i < out_dim[0]; ++i) { - hl_memcpy(out_data + element_len * i, - const_cast(x_data) + element_len * cpy_map[i], - sizeof(T) * element_len); + auto* y = context.Input("Y"); + auto y_lod = y->lod(); + for (int i = 0; i < y_lod[0].size() - 1; ++i) { + repeats.push_back((y_lod[0][i + 1] - y_lod[0][i]) / + (x_lod[0][i + 1] - x_lod[0][i])); } + 
out->Resize(x_dims); } + + framework::LoD out_lod; + auto level0 = repeat_lod(x_lod[0], x_lod[0], repeats, true); + out_lod.push_back(level0); + for (int i = 1; i < x_lod.size(); ++i) { + out_lod.push_back(repeat_lod(x_lod[i], x_lod[0], repeats, false)); + } + + size_t element_len = framework::product(x_dims) / x_dims[0]; + T* out_data = out->mutable_data(context.GetPlace()); + Place place = boost::get(context.GetPlace()); + repeat_data(x_data, out_data, element_len, x_lod[0], repeats, + place); + out->set_lod(out_lod); } }; diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py index 81067f38bb..0b0de78caf 100644 --- a/python/paddle/v2/framework/tests/op_test.py +++ b/python/paddle/v2/framework/tests/op_test.py @@ -246,7 +246,9 @@ class OpTest(unittest.TestCase): else: actual = np.array(self.scope.find_var(out_name).get_tensor()) expect = self.outputs[out_name] - + print "out_name: %s" % out_name + print "actual: %s" % actual + print "expcept: %s" % expect self.assertTrue( np.allclose( actual, expect, atol=atol), diff --git a/python/paddle/v2/framework/tests/test_seq_expand.py b/python/paddle/v2/framework/tests/test_seq_expand.py index 4608d3c3bd..854148a8f1 100644 --- a/python/paddle/v2/framework/tests/test_seq_expand.py +++ b/python/paddle/v2/framework/tests/test_seq_expand.py @@ -3,59 +3,119 @@ import numpy as np from op_test import OpTest +def repeat(list, starts, times, is_first): + newlist = [list[0]] + if is_first: + for i, time in enumerate(times): + size = list[i + 1] - list[i] + newlist.append(newlist[-1] + size * time) + else: + for i, time in enumerate(times): + start = list.index(starts[i]) + end = list.index(starts[i + 1]) + 1 + for t in range(time): + for index in range(start, end - 1): + newlist.append(newlist[-1] + list[index + 1] - list[index]) + return newlist + + +def repeat_array(array, starts, times): + newlist = [] + for i, time in enumerate(times): + for t in range(time): + newlist.extend(array[starts[i]:starts[i + 1]]) + return newlist + + class TestSeqExpand(OpTest): - #class TestSeqExpand(): def set_data(self): self.op_type = 'seq_expand' x = np.random.uniform(0.1, 1, [3, 2, 2]).astype('float32') y = np.zeros((6, 2, 2)).astype('float32') - lod = [[0, 2, 3, 6]] - print "x = %s" % x - self.inputs = {'X': x, 'Y': (y, lod)} - self.repeat = None + y_lod = [[0, 2, 3, 6]] + self.inputs = {'X': (x, None), 'Y': (y, y_lod)} + self.repeat = 2 def compute(self): - x = self.inputs['X'] - cpy_map = {} - lod = [] - out_shape = [] + x_data, x_lod = self.inputs['X'] + print "x_data: %s" % x_data + print "x_lod: %s" % x_lod + if not x_lod: + x_lod = [[i for i in range(1 + x_data.shape[0])]] + else: + x_lod = [x_lod[0]] + x_lod if self.repeat: - level0 = [] - for i in range(x.shape[0] + 1): - level0.append(i * self.repeat) - lod.append(level0) - - for i in x.shape: - out_shape.append(i) - out_shape[0] = out_shape[0] * self.repeat + self.attrs = {'repeat': self.repeat} + repeats = (len(x_lod[0]) - 1) * [self.repeat] + # get out shape + # out_shape = np.copy(x_data.shape) + # out_shape[0] = out_shape[0] * self.repeat else: - y, lod = self.inputs['Y'] - out_shape = y.shape - out = np.zeros(out_shape).astype('float32') + y_data, y_lod = self.inputs['Y'] + print "y_lod: %s" % y_lod + #print "y_lod: %s" % y_lod + # get repeats + repeats = [((y_lod[0][i + 1] - y_lod[0][i]) / + (x_lod[0][i + 1] - x_lod[0][i])) + for i in range(len(y_lod[0]) - 1)] + # get out shape + # out_shape = y_data.shape + # get out lod - start = 0 - - for i in 
range(len(lod[0]) - 1): - for j in range(lod[0][i], lod[0][i + 1]): - cpy_map[j] = i - print "cpy_map = %s" % cpy_map - for i in range(len(out)): - out[i] = x[cpy_map[i]] - - print "out = %s" % out - self.outputs = {'Out': (out, lod)} + out_lod = [repeat(x_lod[0], x_lod[0], repeats, True)] + [ + repeat(lod, x_lod[0], repeats, False) for lod in x_lod[1:] + ] + # copy data + out = repeat_array(x_data.tolist(), x_lod[0], repeats) + self.outputs = {'Out': (out, out_lod)} + print "outputs: %s" % self.outputs def setUp(self): + self.op_type = 'seq_expand' self.set_data() self.compute() def test_check_output(self): self.check_output() - def test_check_grad(self): - self.check_grad(["X"], "Out") + +# def test_check_grad(self): +# self.check_grad(["X"], "Out") + + +class TestSeqExpandCase1(TestSeqExpand): + def set_data(self): + x_data = np.random.uniform(0.1, 1, [7, 1]).astype('float32') + x_lod = [[0, 5, 7], [0, 2, 5, 7]] + self.inputs = {'X': (x_data, x_lod)} + self.repeat = 2 + + +class TestSeqExpandCase2(TestSeqExpand): + def set_data(self): + x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') + self.inputs = {'X': (x_data, None)} + self.repeat = 2 + + +class TestSeqExpandCase3(TestSeqExpand): + def set_data(self): + x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') + y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') + y_lod = [[0, 1, 4, 8]] + self.inputs = {'X': (x_data, None), 'Y': (y_data, y_lod)} + self.repeat = None + + +class TestSeqExpandCase4(TestSeqExpand): + def set_data(self): + x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') + x_lod = [[0, 2, 5]] + y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') + y_lod = [[0, 4, 13], [0, 2, 4, 7, 10, 13]] + self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + self.repeat = None if __name__ == '__main__': unittest.main() -# TestSeqExpand().setUp() From 8de04be786fe21a72b9be91dab963f5d7520885b Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 18 Oct 2017 17:14:38 +0800 Subject: [PATCH 14/81] Fix unitest --- paddle/framework/lod_tensor.cc | 29 +++++++ paddle/framework/lod_tensor.h | 7 ++ paddle/operators/seq_expand_op.h | 79 +++++-------------- .../v2/framework/tests/test_seq_expand.py | 30 ++----- 4 files changed, 64 insertions(+), 81 deletions(-) diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index 5b7badf89c..1247daafc5 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -103,5 +103,34 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin, lod_ = new_lod; } +Vector repeat_lod(Vector data, Vector starts, + Vector times, bool is_first) { + Vector result; + result.push_back(data[0]); + size_t p = 0, start = 0, end = 0; + if (is_first == true) { + for (size_t i = 0; i < times.size(); ++i) { + result.push_back(data.back() + times[i] * (data[i + 1] - data[i])); + } + } else { + for (size_t i = 0; i < times.size(); ++i) { + while (starts[i] != data[p] && p < data.size()) { + ++p; + } + start = p; + while (starts[i + 1] != data[p] && p < data.size()) { + ++p; + } + end = p + 1; + for (size_t j = 0; j < times[i]; ++j) { + for (size_t index = start; index < end - 1; ++index) { + result.push_back(result.back() + data[index + 1] - data[index]); + } + } + } + } + return result; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 4db36ee766..41c83a1164 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -15,6 
+15,9 @@ #pragma once #include +#include "paddle/memory/memcpy.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" #ifdef PADDLE_WITH_CUDA #include #include @@ -122,5 +125,9 @@ class LoDTensor : public Tensor { private: LoD lod_; }; + +Vector repeat_lod(Vector data, Vector starts, + Vector times, bool is_first); + } // namespace framework } // namespace paddle diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index cd1182c4f0..221393f909 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/seq_expand_op.h @@ -22,54 +22,6 @@ namespace operators { using LoDTensor = framework::LoDTensor; -template -using vector = framework::Vector; - -vector repeat_lod(vector data, vector starts, - vector times, bool is_first) { - vector result; - result.push_back(data[0]); - size_t p = 0, start = 0, end = 0; - if (is_first == true) { - for (size_t i = 0; i < times.size(); ++i) { - result.push_back(data.back() + times[i] * (data[i + 1] - data[i])); - } - } else { - for (size_t i = 0; i < times.size(); ++i) { - while (starts[i] != data[p] && p < data.size()) { - ++p; - } - start = p; - while (starts[i + 1] != data[p] && p < data.size()) { - ++p; - } - end = p + 1; - for (size_t j = 0; j < times[i]; ++j) { - for (size_t index = start; index < end - 1; ++index) { - result.push_back(result.back() + data[index + 1] - data[index]); - } - } - } - } - return result; -} - -template -void repeat_data(const T* src, T* dst, size_t size, vector starts, - vector times, Place place) { - const T* src_p = src; - T* dst_p = dst; - size_t count = 0; - for (size_t i = 0; i < times.size(); ++i) { - count = size * (starts[i + 1] - starts[i]); - for (size_t j = 0; j < times[i]; ++j) { - memory::Copy(place, dst_p, place, src_p, sizeof(T) * count); - dst_p += count; - } - src_p += count; - } -} - template class SeqExpandKernel : public framework::OpKernel { public: @@ -81,7 +33,7 @@ class SeqExpandKernel : public framework::OpKernel { auto x_lod = x->lod(); if (x_lod.size() == 0) { - vector level; + framework::Vector level; for (int i = 0; i < x->dims()[0] + 1; ++i) { level.push_back(i); } @@ -91,7 +43,7 @@ class SeqExpandKernel : public framework::OpKernel { } size_t repeat = static_cast(context.Attr("repeat")); - vector repeats; + framework::Vector repeats; if (repeat != 0) { for (int i = 0; i < x_lod[0].size() - 1; ++i) { repeats.push_back(repeat); @@ -107,21 +59,32 @@ class SeqExpandKernel : public framework::OpKernel { repeats.push_back((y_lod[0][i + 1] - y_lod[0][i]) / (x_lod[0][i + 1] - x_lod[0][i])); } - out->Resize(x_dims); + out->Resize(y->dims()); } framework::LoD out_lod; - auto level0 = repeat_lod(x_lod[0], x_lod[0], repeats, true); + auto level0 = framework::repeat_lod(x_lod[0], x_lod[0], repeats, true); out_lod.push_back(level0); for (int i = 1; i < x_lod.size(); ++i) { - out_lod.push_back(repeat_lod(x_lod[i], x_lod[0], repeats, false)); + out_lod.push_back( + framework::repeat_lod(x_lod[i], x_lod[0], repeats, false)); } size_t element_len = framework::product(x_dims) / x_dims[0]; T* out_data = out->mutable_data(context.GetPlace()); + + // copy data Place place = boost::get(context.GetPlace()); - repeat_data(x_data, out_data, element_len, x_lod[0], repeats, - place); + size_t count = 0; + for (size_t i = 0; i < repeats.size(); ++i) { + count = element_len * (x_lod[0][i + 1] - x_lod[0][i]); + for (size_t j = 0; j < repeats[i]; ++j) { + memory::Copy(place, out_data, place, x_data, sizeof(T) * count); + out_data += count; + } + x_data += count; 
+ } + out->set_lod(out_lod); } }; @@ -130,9 +93,9 @@ template class SeqExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - // auto* d_out = context.Input(framework::GradVarName("Out")); - // auto* d_x = context.Output(framework::GradVarName("X")); - // d_x->mutable_data(context.GetPlace()); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + d_x->mutable_data(context.GetPlace()); } }; diff --git a/python/paddle/v2/framework/tests/test_seq_expand.py b/python/paddle/v2/framework/tests/test_seq_expand.py index 854148a8f1..2b9509413e 100644 --- a/python/paddle/v2/framework/tests/test_seq_expand.py +++ b/python/paddle/v2/framework/tests/test_seq_expand.py @@ -29,17 +29,13 @@ def repeat_array(array, starts, times): class TestSeqExpand(OpTest): def set_data(self): - self.op_type = 'seq_expand' - x = np.random.uniform(0.1, 1, [3, 2, 2]).astype('float32') - y = np.zeros((6, 2, 2)).astype('float32') - y_lod = [[0, 2, 3, 6]] - self.inputs = {'X': (x, None), 'Y': (y, y_lod)} + x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') + self.inputs = {'X': x_data} self.repeat = 2 def compute(self): - x_data, x_lod = self.inputs['X'] - print "x_data: %s" % x_data - print "x_lod: %s" % x_lod + x = self.inputs['X'] + x_data, x_lod = x if type(x) == tuple else (x, None) if not x_lod: x_lod = [[i for i in range(1 + x_data.shape[0])]] else: @@ -47,28 +43,16 @@ class TestSeqExpand(OpTest): if self.repeat: self.attrs = {'repeat': self.repeat} repeats = (len(x_lod[0]) - 1) * [self.repeat] - # get out shape - # out_shape = np.copy(x_data.shape) - # out_shape[0] = out_shape[0] * self.repeat else: y_data, y_lod = self.inputs['Y'] - print "y_lod: %s" % y_lod - #print "y_lod: %s" % y_lod - # get repeats repeats = [((y_lod[0][i + 1] - y_lod[0][i]) / (x_lod[0][i + 1] - x_lod[0][i])) for i in range(len(y_lod[0]) - 1)] - # get out shape - # out_shape = y_data.shape - # get out lod - out_lod = [repeat(x_lod[0], x_lod[0], repeats, True)] + [ repeat(lod, x_lod[0], repeats, False) for lod in x_lod[1:] ] - # copy data out = repeat_array(x_data.tolist(), x_lod[0], repeats) - self.outputs = {'Out': (out, out_lod)} - print "outputs: %s" % self.outputs + self.outputs = {'Out': out} def setUp(self): self.op_type = 'seq_expand' @@ -94,7 +78,7 @@ class TestSeqExpandCase1(TestSeqExpand): class TestSeqExpandCase2(TestSeqExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') - self.inputs = {'X': (x_data, None)} + self.inputs = {'X': x_data} self.repeat = 2 @@ -103,7 +87,7 @@ class TestSeqExpandCase3(TestSeqExpand): x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') y_lod = [[0, 1, 4, 8]] - self.inputs = {'X': (x_data, None), 'Y': (y_data, y_lod)} + self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} self.repeat = None From 31531ab581f7d726d410c2181ac79ed41a32b3ef Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Thu, 19 Oct 2017 01:18:20 +0800 Subject: [PATCH 15/81] Add backward kernel --- paddle/framework/lod_tensor.cc | 2 +- paddle/operators/seq_expand_op.cc | 30 +++++-------------- paddle/operators/seq_expand_op.h | 27 +++++++++++++++-- paddle/operators/sequence_concat_op.cc | 10 +++---- python/paddle/v2/framework/tests/op_test.py | 3 -- .../v2/framework/tests/test_seq_expand.py | 5 ++-- 6 files changed, 39 insertions(+), 38 deletions(-) diff --git 
a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index 1247daafc5..e4a2f5765a 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -110,7 +110,7 @@ Vector repeat_lod(Vector data, Vector starts, size_t p = 0, start = 0, end = 0; if (is_first == true) { for (size_t i = 0; i < times.size(); ++i) { - result.push_back(data.back() + times[i] * (data[i + 1] - data[i])); + result.push_back(result.back() + times[i] * (data[i + 1] - data[i])); } } else { for (size_t i = 0; i < times.size(); ++i) { diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc index 63b17a10f5..59d7135489 100644 --- a/paddle/operators/seq_expand_op.cc +++ b/paddle/operators/seq_expand_op.cc @@ -60,7 +60,8 @@ As an example: Given: -X = [1, 2 , 3] +X.data = [1, 2 , 3, 4] +X.lod = [[0, 3, 4], [0, 1, 3, 4]] and @@ -69,8 +70,8 @@ repeat = 2 then we get -Out.data = [1, 1, 2, 2, 3, 3] -Out.lod = [[0, 2, 4, 6]] +Out.data = [1, 2, 3, 1, 2, 3, 4, 4] +Out.lod = [[0, 6, 8], [0, 3, 6, 7, 8], [0, 1, 3, 4, 6, 7, 8]] )DOC"); } @@ -83,6 +84,7 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); auto x_dims = ctx->GetInputDim("X"); @@ -93,30 +95,12 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel { } }; -class SeqExpandOpGradMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - auto* bind = new framework::OpDescBind(); - bind->SetInput("X", Input("X")); - bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); - bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); - bind->SetAttrMap(Attrs()); - bind->SetType("seq_expand_grad"); - return std::unique_ptr(bind); - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; - -REGISTER_OPERATOR(seq_expand, ops::SeqExpandOp, ops::SeqExpandOpMaker, - ops::SeqExpandOpGradMaker); -REGISTER_OPERATOR(seq_expand_grad, ops::SeqExpandOpGrad); +REGISTER_OP(seq_expand, ops::SeqExpandOp, ops::SeqExpandOpMaker, + seq_expand_grad, ops::SeqExpandOpGrad); REGISTER_OP_CPU_KERNEL(seq_expand, ops::SeqExpandKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index 221393f909..8b7bda54c0 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/seq_expand_op.h @@ -16,6 +16,7 @@ #include "paddle/framework/op_registry.h" #include "paddle/memory/memcpy.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace operators { @@ -93,9 +94,29 @@ template class SeqExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - d_x->mutable_data(context.GetPlace()); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + auto* x = context.Input("X"); + auto* out = context.Input("Out"); + auto out_lod = out->lod(); + d_x->set_lod(x->lod()); + const T* d_out_data = d_out->data(); + auto d_out_dims = 
d_out->dims(); + T* d_x_data = d_x->mutable_data(context.GetPlace()); + size_t element_len = framework::product(d_out_dims) / d_out_dims[0]; + for (size_t i = 0; i < out->NumElements(); ++i) { + size_t ele_count = out_lod[0][i + 1] - out_lod[0][i]; + size_t repeat = out->NumElements(0, i); + Eigen::TensorMap> d_out_t( + d_out_data, static_cast(repeat), + static_cast((ele_count * element_len) / repeat)); + Eigen::TensorMap> d_x_t( + d_x_data, static_cast((ele_count * element_len) / repeat)); + auto place = context.GetEigenDevice(); + d_x_t.device(place) = d_out_t.sum(Eigen::array({0})); + d_out_data += (ele_count * element_len); + d_x_data += ((ele_count * element_len) / repeat); + } } };
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc
index 1fce96cdfe..46f73e3c27 100644
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
@@ -68,12 +68,12 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { "The level should be less than the level number of inputs.") .SetDefault(0); AddComment(R"DOC( - The sequence_concat operator concatenates multiple LoDTensors. - It only supports sequence (LoD Tensor with level number is 1) + The sequence_concat operator concatenates multiple LoDTensors. + It only supports sequence (LoD Tensor with level number is 1) or a nested sequence (LoD tensor with level number is 2) as its input. - Case1: If the axis is other than 0(here, axis is 1 and level is 1), - each input should have the same LoD information and the LoD + each input should have the same LoD information and the LoD information of the output keeps the same as the input. LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
@@ -81,7 +81,7 @@ LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4) - Case2: - If the axis is 0(here, level is 0), the inputs are concatenated along + time steps, the LoD information of the output needs to be re-computed. LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
@@ -94,7 +94,7 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) LoD(x1) = {{0,3,5}, {0,1,3,4,5}}; Dims(x1) = (5,3,4) LoD(Out) = {{0,5,9}, {0,2,5,7,9}}; Dims(Out) = (9,3,4) - + NOTE: The levels of all the inputs should be the same.
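The axis-0 numbers above follow from position-wise offset addition: concatenating along time steps adds the sequence end-offsets, so the top levels {0,2,4} and {0,3,5} merge into {0,5,9} because 2+3=5 and 4+5=9. A minimal sketch of that top-level merge; the helper name is illustrative and not part of the operator:

// Merge two top-level LoDs for concatenation along time steps.
// Assumes both inputs hold the same number of sequences, per the NOTE.
std::vector<size_t> MergeLoD0(const std::vector<size_t>& a,
                              const std::vector<size_t>& b) {
  std::vector<size_t> merged(a.size());
  for (size_t i = 0; i < a.size(); ++i) merged[i] = a[i] + b[i];
  return merged;  // {0,2,4} + {0,3,5} -> {0,5,9}
}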
)DOC"); } diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py index 3ef8ec3164..a88e9f0bb8 100644 --- a/python/paddle/v2/framework/tests/op_test.py +++ b/python/paddle/v2/framework/tests/op_test.py @@ -246,9 +246,6 @@ class OpTest(unittest.TestCase): else: actual = np.array(self.scope.find_var(out_name).get_tensor()) expect = self.outputs[out_name] - print "out_name: %s" % out_name - print "actual: %s" % actual - print "expcept: %s" % expect self.assertTrue( np.allclose( actual, expect, atol=atol), diff --git a/python/paddle/v2/framework/tests/test_seq_expand.py b/python/paddle/v2/framework/tests/test_seq_expand.py index 2b9509413e..87e39d72bf 100644 --- a/python/paddle/v2/framework/tests/test_seq_expand.py +++ b/python/paddle/v2/framework/tests/test_seq_expand.py @@ -62,9 +62,8 @@ class TestSeqExpand(OpTest): def test_check_output(self): self.check_output() - -# def test_check_grad(self): -# self.check_grad(["X"], "Out") + def test_check_grad(self): + self.check_grad(["X"], "Out") class TestSeqExpandCase1(TestSeqExpand): From fdfc8f9baaa5648f5d85ec17506cedc07b6f9cd2 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Wed, 18 Oct 2017 18:19:09 -0700 Subject: [PATCH 16/81] "switch to Init op" --- paddle/operators/nccl/nccl_gpu_common.h | 17 +++++- paddle/operators/nccl/nccl_ops.cc | 80 +++++++++++++++++-------- paddle/operators/nccl/nccl_ops.h | 28 ++++++--- 3 files changed, 91 insertions(+), 34 deletions(-) diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h index 5ca6a9e05e..d10688b127 100644 --- a/paddle/operators/nccl/nccl_gpu_common.h +++ b/paddle/operators/nccl/nccl_gpu_common.h @@ -79,7 +79,22 @@ struct Communicator { streams_.resize(gpus.size()); events_.resize(gpus.size()); } - // Communicator(int num_device): comms_.resize(num_device) {} + + ~Communicator() { + for (size_t i = 0; i < gpus_.size(); ++i) { + int gid = gpus_[i]; + platform::SetDeviceId(gid); + + int idx = gid % gpus_.size(); + // wait finish + PADDLE_ENFORCE( + cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0)); + + PADDLE_ENFORCE(cudaEventDestroy(comm->events_[idx])); + + PADDLE_ENFORCE(ncclCommDestroy(comm->comms_[idx])); + } + } inline int get_root_gpu() const { return root_gpu; } diff --git a/paddle/operators/nccl/nccl_ops.cc b/paddle/operators/nccl/nccl_ops.cc index f1a83c1e1e..5cad44dc9f 100644 --- a/paddle/operators/nccl/nccl_ops.cc +++ b/paddle/operators/nccl/nccl_ops.cc @@ -14,7 +14,33 @@ namespace paddle { namespace operators { -// AllreduceOp +// NCCLinitOp +class NCCLInitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Communicator"), + " Input(X) of AllReduce op input should not be NULL"); + } +}; + +class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLInitOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr>("gpus", "gpu id lists"); + AddOutput("Communicator", + "Create Communicator for communicating between gpus"); + AddComment(R"DOC( + create communicator. 
+ )DOC"); + } +}; + +// AllReduceOp class NCCLAllReduceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -23,6 +49,9 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), " Input(X) of AllReduce op input should not be NULL"); + PADDLE_ENFORCE( + ctx->HasInput("Communicator"), + " Input(Communicator) of AllReduce op input should not be NULL"); PADDLE_ENFORCE(ctx->HasOutput("Out"), " Input(X) of AllReduce op input should not be NULL"); @@ -45,6 +74,7 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of AllReduce op"); + AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of AllReduce op"); AddAttr("reduction", "{'ncclmin', 'ncclmax', 'ncclprod', 'ncclsum'}."); @@ -55,31 +85,31 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { } }; -// BcastSendOp -class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker { - public: - NCCLAllReduceOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input of BcastSend op"); - AddComment(R"DOC( - BcastSend the tensors. - )DOC"); - } -}; +// // BcastSendOp +// class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker { +// public: +// NCCLAllReduceOpMaker(framework::OpProto *proto, +// framework::OpAttrChecker *op_checker) +// : OpProtoAndCheckerMaker(proto, op_checker) { +// AddInput("X", "The input of BcastSend op"); +// AddComment(R"DOC( +// BcastSend the tensors. +// )DOC"); +// } +// }; -// BcastRecvOp -class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - NCCLAllReduceOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddOutput("Out", "The output of BcastRecv op"); - AddComment(R"DOC( - BcastRecv the tensors. - )DOC"); - } -}; +// // BcastRecvOp +// class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker { +// public: +// NCCLAllReduceOpMaker(framework::OpProto *proto, +// framework::OpAttrChecker *op_checker) +// : OpProtoAndCheckerMaker(proto, op_checker) { +// AddOutput("Out", "The output of BcastRecv op"); +// AddComment(R"DOC( +// BcastRecv the tensors. 
+// )DOC"); +// } +// }; } // namespace operators } // namespace paddle diff --git a/paddle/operators/nccl/nccl_ops.h b/paddle/operators/nccl/nccl_ops.h index c46fdd7d44..a7a74a0e41 100644 --- a/paddle/operators/nccl/nccl_ops.h +++ b/paddle/operators/nccl/nccl_ops.h @@ -35,6 +35,16 @@ class NCCLTypeWrapper { static const ncclDataType_t type = ncclDouble; }; +class NCCLInitOp : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto gpus = ctx.Input>("gpus"); + auto* comm = ctx.Output("Communicator"); + comm->mutable_data(CPUPlace()); + comm = NCCLManager::GetCommunicator(gpus); + } +}; + template class NCCLAllReduceKernel : public framework::OpKernel { public: @@ -54,13 +64,15 @@ class NCCLAllReduceKernel : public framework::OpKernel { op_type = ncclMax; } + auto* comm = ctx.Input("Communicator"); + auto dev_ctx = static_cast(ctx.device_context()); - platform::NCCLManager* m = platform::NCCLManager::Get(); + // platform::NCCLManager* m = platform::NCCLManager::Get(); - auto* comm = m->GetCommunicator(gpus); - comm->wg_.Add(1); + // auto* comm = m->GetCommunicator(gpus); + // comm->wg_.Add(1); auto stream = dev_ctx.stream(); @@ -76,14 +88,14 @@ class NCCLAllReduceKernel : public framework::OpKernel { op_type, comm->comms_[idx], comm->streams_[idx])); PADDLE_ENFORCE(cudaEventRecord(comm->events_[idx], comm->streams_[idx])); - // wait finish - PADDLE_ENFORCE( - cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0)); + // // wait finish + // PADDLE_ENFORCE( + // cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0)); } - comm->wg_.Done(); + // comm->wg_.Done(); - comm->wg_.Wait(); + // comm->wg_.Wait(); } }; From a94b3dd9a7422fdc02795e73e3e5b4168b0fff45 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Thu, 19 Oct 2017 16:59:43 +0800 Subject: [PATCH 17/81] Refine comments and function name 1. Add more comments and exmples 2. Rename repeat_lod to expand_lod 3. 
Remove unused head file --- paddle/framework/lod_tensor.cc | 22 ++++----- paddle/framework/lod_tensor.h | 7 +-- paddle/operators/seq_expand_op.cc | 76 +++++++++++++++++++++++-------- paddle/operators/seq_expand_op.h | 18 ++++---- 4 files changed, 80 insertions(+), 43 deletions(-) diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index e4a2f5765a..49d9e56689 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -103,28 +103,28 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin, lod_ = new_lod; } -Vector repeat_lod(Vector data, Vector starts, - Vector times, bool is_first) { +Vector expand_lod(Vector level, Vector starts, + Vector scales, bool repeat) { Vector result; - result.push_back(data[0]); + result.push_back(level[0]); size_t p = 0, start = 0, end = 0; - if (is_first == true) { - for (size_t i = 0; i < times.size(); ++i) { - result.push_back(result.back() + times[i] * (data[i + 1] - data[i])); + if (!repeat) { + for (size_t i = 0; i < scales.size(); ++i) { + result.push_back(result.back() + scales[i] * (level[i + 1] - level[i])); } } else { - for (size_t i = 0; i < times.size(); ++i) { - while (starts[i] != data[p] && p < data.size()) { + for (size_t i = 0; i < scales.size(); ++i) { + while (starts[i] != level[p] && p < level.size()) { ++p; } start = p; - while (starts[i + 1] != data[p] && p < data.size()) { + while (starts[i + 1] != level[p] && p < level.size()) { ++p; } end = p + 1; - for (size_t j = 0; j < times[i]; ++j) { + for (size_t j = 0; j < scales[i]; ++j) { for (size_t index = start; index < end - 1; ++index) { - result.push_back(result.back() + data[index + 1] - data[index]); + result.push_back(result.back() + level[index + 1] - level[index]); } } } diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 41c83a1164..c64ee94405 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -15,9 +15,6 @@ #pragma once #include -#include "paddle/memory/memcpy.h" -#include "paddle/platform/device_context.h" -#include "paddle/platform/place.h" #ifdef PADDLE_WITH_CUDA #include #include @@ -126,8 +123,8 @@ class LoDTensor : public Tensor { LoD lod_; }; -Vector repeat_lod(Vector data, Vector starts, - Vector times, bool is_first); +Vector expand_lod(Vector level, Vector starts, + Vector scales, bool repeat); } // namespace framework } // namespace paddle diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc index 59d7135489..b9633721e2 100644 --- a/paddle/operators/seq_expand_op.cc +++ b/paddle/operators/seq_expand_op.cc @@ -50,28 +50,68 @@ class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker { SeqExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - // TODO(wanghaoshuang): Add more comments - AddInput("X", "The input('X') of seq_expand op."); - AddInput("Y", "The reference input('Y') of seq_expand op."); - AddOutput("Out", "The output of seq_expand op."); - AddAttr("repeat", "repeat times").SetDefault(0); + AddInput( + "X", + "The input('X') of seq_expand op. It can be LoDTensor or base Tensor."); + AddInput( + "Y", + "The reference input('Y') of seq_expand op." + "It must be a LoDTensor with k-level(k>0)." + "This reference input is essential if 'repeat' attribute is not " + "configured." + "Input(X) will be expanded by LoD of input(Y) while repeat == 0."); + AddOutput("Out", + "The output of seq_expand op." 
+ "The output is a (k+1)-level LoDTensor" + "while input(X) being k-level LoDTensor." + "(Given base tensor is 0-level LoDTensor.)"); + AddAttr("repeat", + "(type:int; default value: 0)" + "Repeatting times of each element while expanding input(X)." + "It works while input(Y) is not configured.") + .SetDefault(0); AddComment(R"DOC( -As an example: +Expand k-level LoDTensor to (k+1)-level LoDTensor +by lod of input(Y) or 'repeat' attribute. -Given: - -X.data = [1, 2 , 3, 4] -X.lod = [[0, 3, 4], [0, 1, 3, 4]] +Case 1: +Given a 2-level LoDTensor X: + X.data = [1, 2 , 3, 4] + X.lod = [[0, 3, 4], [0, 1, 3, 4]] and - -repeat = 2 - - -then we get - -Out.data = [1, 2, 3, 1, 2, 3, 4, 4] -Out.lod = [[0, 6, 8], [0, 3, 6, 7, 8], [0, 1, 3, 4, 6, 7, 8]] + repeat = 2 +then we get 3-level LoDTensor + Out.data = [1, 2, 3, 1, 2, 3, 4, 4] + Out.lod = [[0, 6, 8], + [0, 3, 6, 7, 8], + [0, 1, 3, 4, 6, 7, 8]] + +Case 2: + +Given 2-level a LoDTensor X + X.data = [1, 2, 3, 4] + X.lod = [[0, 3, 4], [0, 1, 3, 4]] +and + Y.lod = [[0, 6, 8], + [0, 3, 6, 7, 8], + [0,1,3,4,6,7,8]] +then we get 3-level LoDTensor + Out.data = [1, 2, 3, 1, 2, 3, 4, 4] + Out.lod = [[0, 6, 8], + [0, 3, 6, 7, 8], + [0, 1, 3, 4, 6, 7, 8]] + +Case 3: + +Given a 0-level LoDTensor X + X.data = [1, 2, 3, 4] + X.lod = NULL +and + repeat = 2 +then we get 1-level LoDTensor + Out.data = [1, 1, 2, 2, 3, 3, 4, 4] + Out.lod = [[0, 2, 4, 6, 8]] )DOC"); } diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index 8b7bda54c0..e990f12512 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/seq_expand_op.h @@ -44,10 +44,10 @@ class SeqExpandKernel : public framework::OpKernel { } size_t repeat = static_cast(context.Attr("repeat")); - framework::Vector repeats; + framework::Vector scales; if (repeat != 0) { for (int i = 0; i < x_lod[0].size() - 1; ++i) { - repeats.push_back(repeat); + scales.push_back(repeat); } std::vector dims = framework::vectorize(x->dims()); dims[0] = dims[0] * repeat; @@ -57,18 +57,18 @@ class SeqExpandKernel : public framework::OpKernel { auto* y = context.Input("Y"); auto y_lod = y->lod(); for (int i = 0; i < y_lod[0].size() - 1; ++i) { - repeats.push_back((y_lod[0][i + 1] - y_lod[0][i]) / - (x_lod[0][i + 1] - x_lod[0][i])); + scales.push_back((y_lod[0][i + 1] - y_lod[0][i]) / + (x_lod[0][i + 1] - x_lod[0][i])); } out->Resize(y->dims()); } framework::LoD out_lod; - auto level0 = framework::repeat_lod(x_lod[0], x_lod[0], repeats, true); + auto level0 = framework::expand_lod(x_lod[0], x_lod[0], scales, false); out_lod.push_back(level0); for (int i = 1; i < x_lod.size(); ++i) { out_lod.push_back( - framework::repeat_lod(x_lod[i], x_lod[0], repeats, false)); + framework::expand_lod(x_lod[i], x_lod[0], scales, true)); } size_t element_len = framework::product(x_dims) / x_dims[0]; @@ -77,9 +77,9 @@ class SeqExpandKernel : public framework::OpKernel { // copy data Place place = boost::get(context.GetPlace()); size_t count = 0; - for (size_t i = 0; i < repeats.size(); ++i) { + for (size_t i = 0; i < scales.size(); ++i) { count = element_len * (x_lod[0][i + 1] - x_lod[0][i]); - for (size_t j = 0; j < repeats[i]; ++j) { + for (size_t j = 0; j < scales[i]; ++j) { memory::Copy(place, out_data, place, x_data, sizeof(T) * count); out_data += count; } @@ -95,9 +95,9 @@ class SeqExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = 
context.Output(framework::GradVarName("X")); auto* x = context.Input("X"); auto* out = context.Input("Out"); + auto* d_x = context.Output(framework::GradVarName("X")); auto out_lod = out->lod(); d_x->set_lod(x->lod()); const T* d_out_data = d_out->data(); From 333045d7b23d4f8befaed815086323bc33391505 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Thu, 19 Oct 2017 21:27:16 -0700 Subject: [PATCH 18/81] "move nccl to another directory" --- paddle/operators/CMakeLists.txt | 16 ++- paddle/operators/nccl/CMakeLists.txt | 8 +- paddle/operators/nccl/nccl_gpu_common.cc | 68 ++---------- paddle/operators/nccl/nccl_gpu_common.h | 61 +++-------- paddle/operators/nccl/nccl_ops.cu | 16 --- paddle/operators/nccl/nccl_ops.h | 103 ------------------ .../{nccl/nccl_ops.cc => nccl_op.cc} | 57 +++++----- paddle/operators/nccl_op.cu | 66 +++++++++++ paddle/operators/nccl_op.h | 50 +++++++++ .../v2/framework/tests/test_nccl_ops.py | 36 ++++-- 10 files changed, 215 insertions(+), 266 deletions(-) delete mode 100644 paddle/operators/nccl/nccl_ops.cu delete mode 100644 paddle/operators/nccl/nccl_ops.h rename paddle/operators/{nccl/nccl_ops.cc => nccl_op.cc} (73%) create mode 100644 paddle/operators/nccl_op.cu create mode 100644 paddle/operators/nccl_op.h diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 4457101275..4faf9bbb08 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -76,6 +76,14 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(sigmoid);\n") endif() + # nccl_op contains several operators + if ("${TARGET}" STREQUAL "nccl_op") + set(pybind_flag 1) + # It's enough to just adding one operator to pybind + file(APPEND ${pybind_file} "USE_OP(ncclInit);\n") + # file(APPEND ${pybind_file} "USE_OP(ncclInit);\n") + endif() + # reduce_op contains several operators if ("${TARGET}" STREQUAL "reduce_op") set(pybind_flag 1) @@ -116,7 +124,9 @@ set(DEPS_OPS softmax_with_cross_entropy_op sum_op pool_op - pool_with_index_op) + pool_with_index_op + nccl_op + ) op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc @@ -127,6 +137,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(sum_op DEPS net_op) op_library(pool_op DEPS pooling) op_library(pool_with_index_op DEPS pooling) +if(WITH_GPU) +op_library(nccl_op DEPS nccl_common) +endif() list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) @@ -134,6 +147,7 @@ foreach(src ${GENERAL_OPS}) endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") +message(STATUS "operators_list: ${OP_LIBRARY}") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) diff --git a/paddle/operators/nccl/CMakeLists.txt b/paddle/operators/nccl/CMakeLists.txt index 05c27f08fe..bdd873b3f3 100644 --- a/paddle/operators/nccl/CMakeLists.txt +++ b/paddle/operators/nccl/CMakeLists.txt @@ -1,8 +1,4 @@ if(WITH_GPU) - nv_library(nccl_common SRCS nccl_gpu_common DEPS device_context operator) - nv_library(nccl_op SRCS nccl_ops.cc DEPS nccl_common) -else() - cc_library(nccl_common SRCS nccl_gpu_common DEPS device_context operator) + nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator) + nv_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common) endif() - -cc_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common) diff --git a/paddle/operators/nccl/nccl_gpu_common.cc b/paddle/operators/nccl/nccl_gpu_common.cc index 
934f79f245..6be735e4c7 100644 --- a/paddle/operators/nccl/nccl_gpu_common.cc +++ b/paddle/operators/nccl/nccl_gpu_common.cc @@ -1,61 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #include "paddle/operators/nccl/nccl_gpu_common.h" #include "paddle/platform/gpu_info.h" namespace paddle { -namespace platform { - -NCCLManager::NCCLManager() {} - -NCCLManager::~NCCLManager() { - for (auto& p : comm_table) { - auto& comm = p.second; - auto& gpus_ = comm->gpus_; - for (size_t i = 0; i < gpus_.size(); ++i) { - int gid = gpus_[i]; - platform::SetDeviceId(gid); - - // mapping gid to idx - int idx = gid % gpus_.size(); - // wait finish - PADDLE_ENFORCE( - cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0)); - - PADDLE_ENFORCE(cudaEventDestroy(comm->events_[idx])); - - PADDLE_ENFORCE(ncclCommDestroy(comm->comms_[idx])); - } - comm.reset(nullptr); - } -} - -Communicator* NCCLManager::GetCommunicator(const std::vector& gpus) { - std::string key; - for (auto& id : gpus) { - key += std::to_string(id); - } - std::sort(key.begin(), key.end()); - - std::mutex mu; - std::lock_guard lk(mu); - - auto it = comm_table.find(key); - - if (it->second == nullptr) { - auto* comm = new Communicator(gpus); - PADDLE_ENFORCE( - ncclCommInitAll(comm->comms_.data(), gpus.size(), gpus.data())); - - for (size_t i = 0; i < gpus.size(); ++i) { - platform::SetDeviceId(gpus[i]); - - // block wait - PADDLE_ENFORCE(cudaEventCreateWithFlags( - &comm->events_[i], cudaEventBlockingSync | cudaEventDisableTiming)); - } - comm_table[key].reset(comm); - } - return comm_table[key].get(); -} - -} // namespace operators +namespace platform {} // namespace platform } // namespace paddle diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h index d10688b127..2b7510de1c 100644 --- a/paddle/operators/nccl/nccl_gpu_common.h +++ b/paddle/operators/nccl/nccl_gpu_common.h @@ -65,65 +65,30 @@ class WaitGroup { std::condition_variable cv_; }; -// TODO(dzh) : make resources managed unified with framework struct Communicator { std::vector comms_; - std::vector streams_; - std::vector events_; - std::vector gpus_; - WaitGroup wg_; - int root_gpu = -1; - // cudaEvent_t root_monitor; - explicit Communicator(const std::vector& gpus) : gpus_(gpus) { + std::unordered_map comm_id_map_; + + int GetCommId(int device_id) const { return comm_id_map_.at(device_id); } + + void InitAll(const std::vector& gpus) { comms_.resize(gpus.size()); - streams_.resize(gpus.size()); - events_.resize(gpus.size()); + for (size_t i = 0; i < gpus.size(); ++i) { + comm_id_map_[gpus[i]] = i; + } + PADDLE_ENFORCE(ncclCommInitAll(comms_.data(), gpus.size(), gpus.data())); } ~Communicator() { - for (size_t i = 0; i < gpus_.size(); ++i) { - int gid = gpus_[i]; - platform::SetDeviceId(gid); - - int idx = gid % gpus_.size(); - // wait finish - PADDLE_ENFORCE( - cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0)); - - 
PADDLE_ENFORCE(cudaEventDestroy(comm->events_[idx])); - - PADDLE_ENFORCE(ncclCommDestroy(comm->comms_[idx])); + for (size_t i = 0; i < comms_.size(); ++i) { + PADDLE_ENFORCE(ncclCommDestroy(comms_[i])); } } - inline int get_root_gpu() const { return root_gpu; } - - inline void set_root_gpu(int id) { root_gpu = id; } + // DISABLE_COPY_AND_ASSIGN(Communicator); }; -class NCCLManager { - public: - static NCCLManager* Get() { - static NCCLManager m; - return &m; - } - - NCCLManager(); - - ~NCCLManager(); - - // for each card only have one communicator - Communicator* GetCommunicator(const std::vector& gpus); - - private: - // // the gpu id list available. Note that only support - // // whole world communication. - // std::vector _gpu_worlds; - - // communicator list - std::unordered_map> - comm_table; -}; +Communicator* NewCommunicator(const std::vector& gpus); } // namespace platform } // namespace paddle diff --git a/paddle/operators/nccl/nccl_ops.cu b/paddle/operators/nccl/nccl_ops.cu deleted file mode 100644 index eabe5f1729..0000000000 --- a/paddle/operators/nccl/nccl_ops.cu +++ /dev/null @@ -1,16 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#define EIGEN_USE_GPU -#include "paddle/operators/nccl/nccl_ops.h" - -namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); \ No newline at end of file diff --git a/paddle/operators/nccl/nccl_ops.h b/paddle/operators/nccl/nccl_ops.h deleted file mode 100644 index a7a74a0e41..0000000000 --- a/paddle/operators/nccl/nccl_ops.h +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
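The rewritten Communicator above drops the stream and event bookkeeping entirely: it owns one ncclComm_t per device plus a device-id-to-index map, and tears the group down with ncclCommDestroy. A minimal usage sketch under those assumptions; this is hypothetical driver code, not part of any patch in the series:

// Build a communicator group for two visible GPUs and look one up.
// InitAll forwards to ncclCommInitAll; comms_ is public in this revision.
std::vector<int> gpus = {0, 1};
paddle::platform::Communicator comm;
comm.InitAll(gpus);           // one ncclComm_t per listed device
int idx = comm.GetCommId(1);  // index into comm.comms_ for GPU 1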
*/ - -#pragma once -#include "paddle/framework/op_registry.h" -#include "paddle/operators/nccl/nccl_gpu_common.h" - -#include - -namespace paddle { -namespace operators { - -using framework::Tensor; - -template -class NCCLTypeWrapper; - -template <> -class NCCLTypeWrapper { - public: - static const ncclDataType_t type = ncclFloat; -}; - -template <> -class NCCLTypeWrapper { - public: - static const ncclDataType_t type = ncclDouble; -}; - -class NCCLInitOp : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto gpus = ctx.Input>("gpus"); - auto* comm = ctx.Output("Communicator"); - comm->mutable_data(CPUPlace()); - comm = NCCLManager::GetCommunicator(gpus); - } -}; - -template -class NCCLAllReduceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto outs = ctx.MultiOutput("Out"); - std::string reduction = ctx.Attr("reduction"); - std::vector gpus = ctx.Attr>("gpus"); - ncclRedOp_t op_type; - if (reduction == "ncclSum") { - op_type = ncclSum; - } else if (reduction == "ncclProd") { - op_type = ncclProd; - } else if (reduction == "ncclMin") { - op_type = ncclMin; - } else if (reduction == "ncclMax") { - op_type = ncclMax; - } - - auto* comm = ctx.Input("Communicator"); - - auto dev_ctx = - static_cast(ctx.device_context()); - - // platform::NCCLManager* m = platform::NCCLManager::Get(); - - // auto* comm = m->GetCommunicator(gpus); - // comm->wg_.Add(1); - - auto stream = dev_ctx.stream(); - - // device id - int gid = static_cast(ctx.GetPlace()).GetDeviceId(); - int idx = gid % gpus.size(); - comm->streams_[idx] = stream; - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE( - ncclAllReduce(ins[i]->data(), outs[i]->mutable_data(), - outs[i]->numel() * sizeof(T), NCCLTypeWrapper::type, - op_type, comm->comms_[idx], comm->streams_[idx])); - PADDLE_ENFORCE(cudaEventRecord(comm->events_[idx], comm->streams_[idx])); - - // // wait finish - // PADDLE_ENFORCE( - // cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0)); - } - - // comm->wg_.Done(); - - // comm->wg_.Wait(); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/nccl/nccl_ops.cc b/paddle/operators/nccl_op.cc similarity index 73% rename from paddle/operators/nccl/nccl_ops.cc rename to paddle/operators/nccl_op.cc index 5cad44dc9f..91584a377e 100644 --- a/paddle/operators/nccl/nccl_ops.cc +++ b/paddle/operators/nccl_op.cc @@ -9,7 +9,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/nccl/nccl_ops.h" +#include "paddle/operators/nccl_op.h" namespace paddle { namespace operators { @@ -85,31 +85,36 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { } }; -// // BcastSendOp -// class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker { -// public: -// NCCLAllReduceOpMaker(framework::OpProto *proto, -// framework::OpAttrChecker *op_checker) -// : OpProtoAndCheckerMaker(proto, op_checker) { -// AddInput("X", "The input of BcastSend op"); -// AddComment(R"DOC( -// BcastSend the tensors. 
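The reduction-name dispatch in the deleted kernel above leaves op_type uninitialized when the attribute carries an unknown string; a sketch of the same dispatch factored into a helper that fails loudly instead (ToNcclRedOp is a name invented here):

    #include <string>
    #include <nccl.h>
    #include "paddle/platform/enforce.h"

    // Maps the "reduction" attribute string to the nccl enum.
    ncclRedOp_t ToNcclRedOp(const std::string& reduction) {
      if (reduction == "ncclSum") return ncclSum;
      if (reduction == "ncclProd") return ncclProd;
      if (reduction == "ncclMin") return ncclMin;
      if (reduction == "ncclMax") return ncclMax;
      PADDLE_THROW("unknown nccl reduction name");
    }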
-// )DOC"); -// } -// }; +// BcastOp +class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLAllBcastOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of Bcast op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddInput("root", "root gpu of Bcast"); + AddComment(R"DOC( + Bcast the tensors. + )DOC"); + } +}; -// // BcastRecvOp -// class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker { -// public: -// NCCLAllReduceOpMaker(framework::OpProto *proto, -// framework::OpAttrChecker *op_checker) -// : OpProtoAndCheckerMaker(proto, op_checker) { -// AddOutput("Out", "The output of BcastRecv op"); -// AddComment(R"DOC( -// BcastRecv the tensors. -// )DOC"); -// } -// }; +// BcastRecvOp +class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLReduceOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of Reduce op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddInput("root", "root gpu of Reduce"); + AddOutput("Out", "The output of Reduce op"); + AddComment(R"DOC( + Reduce the tensors. + )DOC"); + } +}; } // namespace operators } // namespace paddle @@ -117,3 +122,5 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp, ops::NCCLAllReduceOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(ncclInit, ops::NCCLInitOp, ops::NCCLInitOpMaker); +REGISTER_OP_CPU_KERNEL(ncclInit, ops::NCCLInitKernel); diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu new file mode 100644 index 0000000000..6b0a325d17 --- /dev/null +++ b/paddle/operators/nccl_op.cu @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/operators/nccl_op.h" + +namespace paddle { +namespace operators { + +template +class NCCLAllReduceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + auto ins = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + std::string reduction = ctx.Attr("reduction"); + ncclRedOp_t op_type; + if (reduction == "ncclSum") { + op_type = ncclSum; + } else if (reduction == "ncclProd") { + op_type = ncclProd; + } else if (reduction == "ncclMin") { + op_type = ncclMin; + } else if (reduction == "ncclMax") { + op_type = ncclMax; + } else { + PADDLE_ENFORCE(false, "reduction error."); + } + + auto* comm = ctx.Input("Communicator"); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + + // device id + int device_id = + boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(device_id); + + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE(ncclAllReduce( + ins[i]->data(), outs[i]->mutable_data(ctx.GetPlace()), + outs[i]->numel() * sizeof(T), NCCLTypeWrapper::type, op_type, + comm->comms_[idx], stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); diff --git a/paddle/operators/nccl_op.h b/paddle/operators/nccl_op.h new file mode 100644 index 0000000000..09606c4acd --- /dev/null +++ b/paddle/operators/nccl_op.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
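The loop above reduces every input tensor in place across all ranks; what each rank holds afterwards can be checked against a simple host-side oracle, mirroring the Python allreduce helper in test_nccl_ops.py (a sketch, AllReduceSumRef is a name invented here):

    #include <cstddef>
    #include <vector>

    // Elementwise sum over all ranks' buffers; after ncclAllReduce with
    // ncclSum, every rank's output buffer equals this.
    std::vector<float> AllReduceSumRef(
        const std::vector<std::vector<float>>& bufs) {
      std::vector<float> out(bufs[0].size(), 0.0f);
      for (const auto& b : bufs) {
        for (size_t i = 0; i < b.size(); ++i) out[i] += b[i];
      }
      return out;
    }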
*/ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/nccl/nccl_gpu_common.h" + +#include + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::Communicator; + +template +class NCCLTypeWrapper; + +template <> +class NCCLTypeWrapper { + public: + static const ncclDataType_t type = ncclFloat; +}; + +template <> +class NCCLTypeWrapper { + public: + static const ncclDataType_t type = ncclDouble; +}; + +template +class NCCLInitKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* gpus = ctx.Input>("gpus"); + auto* comm = ctx.Output("Communicator"); + comm->InitAll(*gpus); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/framework/tests/test_nccl_ops.py b/python/paddle/v2/framework/tests/test_nccl_ops.py index 9bfa4c74d4..6dd6231aa8 100644 --- a/python/paddle/v2/framework/tests/test_nccl_ops.py +++ b/python/paddle/v2/framework/tests/test_nccl_ops.py @@ -5,13 +5,15 @@ from paddle.v2.framework.op import Operator import paddle.v2.framework.core as core from op_test import OpTest, create_op, set_input -gpu_list = os.environ["NV_LIST"] +# gpu_list = os.environ["NV_LIST"] +gpu_list = "0,1,2,3" if not core.is_compile_gpu() or not gpu_list: exit(0) -def allreduce(tensors, num_device): +def allreduce(tensors, gpus): + num_device = len(gpus) assert (len(tensors) == num_device), "not match of tensor and device" Out = tensors for i in range(1, len(tensors)): @@ -24,23 +26,32 @@ def allreduce(tensors, num_device): class TestNCCLAllReduce(unittest.TestCase): - def __init__(self): - self.op_type = "nnclAllReduce" + def setUp(self): - self.gpus = [int(g) for g in gpu_list] + self.op_type = "ncclAllReduce" + self.gpus = [int(g) for g in gpu_list.split(",")] + + self.g_scope = core.Scope() + self.g_ctx = core.DeviceContext.create(core.CPUPlace()) self.scopes = [] self.ops = [] self.places = [] self.input_data = [] + for i in range(len(self.gpus)): - input_data.append(np.random.random((32, 32))) - self.output_data = allreduce(input_data) + self.input_data.append(np.random.random((32, 32))) + self.output_data = allreduce(self.input_data, self.gpus) + + nccl_init = Operator("ncclInit", Out="Communicator", gpus=self.gpus) + op.run(self.g_scope, self.g_ctx) for i in range(len(self.gpus)): - scope = core.Scope() + # insert kid scope + scope = self.g_scope.new_scope() place = core.GPUPlace(self.gpus[i]) + inputs = {"X": self.input_data[i]} outputs = {"Out": self.output_data[i]} attrs = {"gpus": self.gpus} @@ -66,8 +77,11 @@ class TestNCCLAllReduce(unittest.TestCase): self.assertTrue(actual, expect), "has diff" -if __name__ == "__main__": - # usage : export NV_LIST=0,1,2,3 python *.py +# if __name__ == "__main__": +# unittest.main() +# usage : export NV_LIST=0,1,2,3 python *.py + +# os.environ["NV_LIST"] = ["0,1,2,3"] - os.environ["NV_LIST"] = ["0,1,2,3"] +if __name__ == "__main__": unittest.main() From 00ad7512cf21b35df7658011a2d5b680cd3d1f19 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Fri, 20 Oct 2017 15:23:48 +0800 Subject: [PATCH 19/81] Use stream while memory::Copy in GPU mode --- paddle/operators/seq_expand_op.cc | 2 +- paddle/operators/seq_expand_op.h | 38 ++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc index b9633721e2..7add3d60f6 100644 --- a/paddle/operators/seq_expand_op.cc +++ 
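NCCLTypeWrapper above is a traits class: a kernel templated on T recovers the matching ncclDataType_t at compile time instead of switching at runtime. A condensed sketch of the pattern (AllReduceOne is invented for illustration; the float specialization is copied from the header):

    #include <nccl.h>

    template <typename T>
    struct NCCLTypeWrapper;

    template <>
    struct NCCLTypeWrapper<float> {
      static const ncclDataType_t type = ncclFloat;
    };

    // The dtype argument comes from the trait; note that nccl counts
    // elements, not bytes.
    template <typename T>
    ncclResult_t AllReduceOne(const T* in, T* out, size_t n, ncclComm_t comm,
                              cudaStream_t stream) {
      return ncclAllReduce(in, out, n, NCCLTypeWrapper<T>::type, ncclSum,
                           comm, stream);
    }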
b/paddle/operators/seq_expand_op.cc @@ -40,7 +40,7 @@ class SeqExpandOp : public framework::OperatorWithKernel { out_dim[0] = out_dim[0] * repeat; } PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of PadOp should not be null."); + "Output(Out) of SeqExpandOp should not be null."); ctx->SetOutputDim("Out", out_dim); } }; diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index e990f12512..d1dcc97920 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/seq_expand_op.h @@ -75,15 +75,37 @@ class SeqExpandKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); // copy data - Place place = boost::get(context.GetPlace()); + auto place = context.GetPlace(); size_t count = 0; - for (size_t i = 0; i < scales.size(); ++i) { - count = element_len * (x_lod[0][i + 1] - x_lod[0][i]); - for (size_t j = 0; j < scales[i]; ++j) { - memory::Copy(place, out_data, place, x_data, sizeof(T) * count); - out_data += count; + if (platform::is_cpu_place(place)) { + auto& cpu_place = boost::get(place); + for (size_t i = 0; i < scales.size(); ++i) { + count = element_len * (x_lod[0][i + 1] - x_lod[0][i]); + for (size_t j = 0; j < scales[i]; ++j) { + memory::Copy(cpu_place, out_data, cpu_place, x_data, + sizeof(T) * count); + out_data += count; + } + x_data += count; } - x_data += count; + } else { +#ifdef PADDLE_WITH_CUDA + auto& gpu_place = boost::get(place); + auto stream = reinterpret_cast( + context.device_context()) + .stream(); + for (size_t i = 0; i < scales.size(); ++i) { + count = element_len * (x_lod[0][i + 1] - x_lod[0][i]); + for (size_t j = 0; j < scales[i]; ++j) { + memory::Copy(gpu_place, out_data, gpu_place, x_data, + sizeof(T) * count, stream); + out_data += count; + } + x_data += count; + } +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif } out->set_lod(out_lod); @@ -113,7 +135,7 @@ class SeqExpandGradKernel : public framework::OpKernel { Eigen::TensorMap> d_x_t( d_x_data, static_cast((ele_count * element_len) / repeat)); auto place = context.GetEigenDevice(); - d_x_t.device(place) = d_out_t.sum(Eigen::array({0})); + d_x_t.device(place) = d_out_t.sum(Eigen::array({{0}})); d_out_data += (ele_count * element_len); d_x_data += ((ele_count * element_len) / repeat); } From d697b6a3497dc7d72f29f0696f23d2d38e349581 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 23 Oct 2017 14:17:15 +0800 Subject: [PATCH 20/81] Modified code using LoDTensor --- paddle/framework/lod_tensor.cc | 14 ++---- paddle/framework/lod_tensor.h | 2 +- paddle/operators/seq_expand_op.cc | 10 ++--- paddle/operators/seq_expand_op.h | 45 ++++++++++++------- python/paddle/v2/framework/tests/op_test.py | 2 + .../v2/framework/tests/test_seq_expand.py | 38 ++++++++++------ 6 files changed, 65 insertions(+), 46 deletions(-) diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index 49d9e56689..6f1e1b870b 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -103,25 +103,19 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin, lod_ = new_lod; } -Vector expand_lod(Vector level, Vector starts, +Vector expand_lod(Vector level, Vector indexes, Vector scales, bool repeat) { Vector result; result.push_back(level[0]); - size_t p = 0, start = 0, end = 0; + size_t start = 0, end = 0; if (!repeat) { for (size_t i = 0; i < scales.size(); ++i) { result.push_back(result.back() + scales[i] * (level[i + 1] - level[i])); } } else { for (size_t i = 0; i < scales.size(); ++i) { - while 
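The CPU/GPU branch above exists because memory::Copy has two shapes: the CPU overload is synchronous, while the GPU overload only enqueues an asynchronous copy on a stream. Roughly (argument lists assumed from the calls in this patch):

    // CPU -> CPU: returns after the copy completes.
    memory::Copy(cpu_place, dst, cpu_place, src, num_bytes);
    // GPU -> GPU: enqueued on `stream`; the host must not read dst
    // until the stream has been synchronized.
    memory::Copy(gpu_place, dst, gpu_place, src, num_bytes, stream);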
(starts[i] != level[p] && p < level.size()) { - ++p; - } - start = p; - while (starts[i + 1] != level[p] && p < level.size()) { - ++p; - } - end = p + 1; + start = indexes[i]; + end = indexes[i + 1]; for (size_t j = 0; j < scales[i]; ++j) { for (size_t index = start; index < end - 1; ++index) { result.push_back(result.back() + level[index + 1] - level[index]); diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index af5e9f8abc..4d1ec29f60 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -123,7 +123,7 @@ class LoDTensor : public Tensor { LoD lod_; }; -Vector expand_lod(Vector level, Vector starts, +Vector expand_lod(Vector level, Vector indexes, Vector scales, bool repeat); } // namespace framework diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc index 7add3d60f6..d02a94d164 100644 --- a/paddle/operators/seq_expand_op.cc +++ b/paddle/operators/seq_expand_op.cc @@ -77,15 +77,15 @@ by lod of input(Y) or 'repeat' attribute. Case 1: Given a 2-level LoDTensor X: - X.data = [1, 2 , 3, 4] + X.data = [a, b , c, d] X.lod = [[0, 3, 4], [0, 1, 3, 4]] and repeat = 2 then we get 3-level LoDTensor - Out.data = [1, 2, 3, 1, 2, 3, 4, 4] - Out.lod = [[0, 6, 8], - [0, 3, 6, 7, 8], - [0, 1, 3, 4, 6, 7, 8]] + Out.lod = [[0, 6, 8], + [0, 3, 6, 7, 8], + [0, 1, 3, 4, 6, 7, 8]] + Out.data = [a, b, c, a, b, c, d, d] Case 2: diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index d1dcc97920..e31f60db49 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/seq_expand_op.h @@ -33,15 +33,12 @@ class SeqExpandKernel : public framework::OpKernel { auto x_dims = x->dims(); auto x_lod = x->lod(); - if (x_lod.size() == 0) { - framework::Vector level; - for (int i = 0; i < x->dims()[0] + 1; ++i) { - level.push_back(i); - } - x_lod.push_back(level); - } else { - x_lod.insert(x_lod.begin(), x_lod[0]); + framework::Vector level; + size_t num = (x_lod.size() == 0) ? 
(x->dims()[0] + 1) : x_lod[0].size(); + for (int i = 0; i < num; ++i) { + level.push_back(i); } + x_lod.push_back(level); size_t repeat = static_cast(context.Attr("repeat")); framework::Vector scales; @@ -56,19 +53,27 @@ class SeqExpandKernel : public framework::OpKernel { } else { auto* y = context.Input("Y"); auto y_lod = y->lod(); - for (int i = 0; i < y_lod[0].size() - 1; ++i) { - scales.push_back((y_lod[0][i + 1] - y_lod[0][i]) / - (x_lod[0][i + 1] - x_lod[0][i])); + auto y_abs_lod = y_lod.ToAbsOffset(); + auto x_abs_lod = x_lod.ToAbsOffset(); + for (int i = 0; i < y_abs_lod[0].size() - 1; ++i) { + scales.push_back((y_abs_lod[0][i + 1] - y_abs_lod[0][i]) / + (x_abs_lod[0][i + 1] - x_abs_lod[0][i])); } out->Resize(y->dims()); } + framework::Vector indexes; + for (int size_t i = 0; i < x_lod[0]; ++i) { + indexes[i] = x_lod[0]; + } framework::LoD out_lod; - auto level0 = framework::expand_lod(x_lod[0], x_lod[0], scales, false); + auto level0 = framework::expand_lod(indexes, x_lod[0], scales, false); out_lod.push_back(level0); for (int i = 1; i < x_lod.size(); ++i) { - out_lod.push_back( - framework::expand_lod(x_lod[i], x_lod[0], scales, true)); + for (int j = 0; j < indexes.size(); ++j) { + indexes[j] = x_lod[i - 1][indexes[j]]; + } + out_lod.push_back(framework::expand_lod(x_lod[i], indexes, scales, true)); } size_t element_len = framework::product(x_dims) / x_dims[0]; @@ -80,7 +85,7 @@ class SeqExpandKernel : public framework::OpKernel { if (platform::is_cpu_place(place)) { auto& cpu_place = boost::get(place); for (size_t i = 0; i < scales.size(); ++i) { - count = element_len * (x_lod[0][i + 1] - x_lod[0][i]); + count = element_len * (x_abs_lod[0][i + 1] - x_abs_lod[0][i]); for (size_t j = 0; j < scales[i]; ++j) { memory::Copy(cpu_place, out_data, cpu_place, x_data, sizeof(T) * count); @@ -95,7 +100,7 @@ class SeqExpandKernel : public framework::OpKernel { context.device_context()) .stream(); for (size_t i = 0; i < scales.size(); ++i) { - count = element_len * (x_lod[0][i + 1] - x_lod[0][i]); + count = element_len * (x_abs_lod[0][i + 1] - x_abs_lod[0][i]); for (size_t j = 0; j < scales[i]; ++j) { memory::Copy(gpu_place, out_data, gpu_place, x_data, sizeof(T) * count, stream); @@ -109,6 +114,11 @@ class SeqExpandKernel : public framework::OpKernel { } out->set_lod(out_lod); + for (size_t i = 0; i < lod.size; i++) { + for (size_t j = 0; j < lod[i].size(); j++) { + LOG(INFO) << "lod[" << i << "][" << j "] = " << lod[i][j]; + } + } } }; @@ -121,13 +131,14 @@ class SeqExpandGradKernel : public framework::OpKernel { auto* out = context.Input("Out"); auto* d_x = context.Output(framework::GradVarName("X")); auto out_lod = out->lod(); + auto out_abs_lod = out_lod.ToAbsOffset(); d_x->set_lod(x->lod()); const T* d_out_data = d_out->data(); auto d_out_dims = d_out->dims(); T* d_x_data = d_x->mutable_data(context.GetPlace()); size_t element_len = framework::product(d_out_dims) / d_out_dims[0]; for (size_t i = 0; i < out->NumElements(); ++i) { - size_t ele_count = out_lod[0][i + 1] - out_lod[0][i]; + size_t ele_count = out_abs_lod[0][i + 1] - out_abs_lod[0][i]; size_t repeat = out->NumElements(0, i); Eigen::TensorMap> d_out_t( d_out_data, static_cast(repeat), diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py index a88e9f0bb8..f3108d5108 100644 --- a/python/paddle/v2/framework/tests/op_test.py +++ b/python/paddle/v2/framework/tests/op_test.py @@ -246,6 +246,8 @@ class OpTest(unittest.TestCase): else: actual = 
np.array(self.scope.find_var(out_name).get_tensor()) expect = self.outputs[out_name] + print "actual= %s" % actual + print "expect = %s" % expect self.assertTrue( np.allclose( actual, expect, atol=atol), diff --git a/python/paddle/v2/framework/tests/test_seq_expand.py b/python/paddle/v2/framework/tests/test_seq_expand.py index 87e39d72bf..2910af6b78 100644 --- a/python/paddle/v2/framework/tests/test_seq_expand.py +++ b/python/paddle/v2/framework/tests/test_seq_expand.py @@ -27,7 +27,15 @@ def repeat_array(array, starts, times): return newlist +def toAbsOffset(lod): + for i in range(len(lod) - 2, -1, -1): + for j in range(len(lod[i])): + lod[i][j] = lod[i + 1][lod[i][j]] + return lod + + class TestSeqExpand(OpTest): + #class TestSeqExpand(): def set_data(self): x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') self.inputs = {'X': x_data} @@ -35,23 +43,26 @@ class TestSeqExpand(OpTest): def compute(self): x = self.inputs['X'] + print "x= %s" % x x_data, x_lod = x if type(x) == tuple else (x, None) - if not x_lod: - x_lod = [[i for i in range(1 + x_data.shape[0])]] - else: - x_lod = [x_lod[0]] + x_lod + n = 1 + x_data.shape[0] if not x_lod else len(x_lod[0]) + x_lod = [[i for i in range(n)]] + x_lod + x_abs_lod = toAbsOffset(x_lod) if self.repeat: + print "repeat= %s" % self.repeat self.attrs = {'repeat': self.repeat} repeats = (len(x_lod[0]) - 1) * [self.repeat] else: y_data, y_lod = self.inputs['Y'] - repeats = [((y_lod[0][i + 1] - y_lod[0][i]) / - (x_lod[0][i + 1] - x_lod[0][i])) - for i in range(len(y_lod[0]) - 1)] - out_lod = [repeat(x_lod[0], x_lod[0], repeats, True)] + [ - repeat(lod, x_lod[0], repeats, False) for lod in x_lod[1:] - ] - out = repeat_array(x_data.tolist(), x_lod[0], repeats) + print "y_lod: %s" % y_lod + y_abs_lod = toAbsOffset(y_lod) + repeats = [((y_abs_lod[0][i + 1] - y_abs_lod[0][i]) / + (x_abs_lod[0][i + 1] - x_abs_lod[0][i])) + for i in range(len(y_abs_lod[0]) - 1)] + #out_lod = [repeat(x_lod[0], x_lod[0], repeats, True)] + [ + # repeat(lod, x_lod[0], repeats, False) for lod in x_lod[1:] + #] + out = repeat_array(x_data.tolist(), x_abs_lod[0], repeats) self.outputs = {'Out': out} def setUp(self): @@ -69,7 +80,7 @@ class TestSeqExpand(OpTest): class TestSeqExpandCase1(TestSeqExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [7, 1]).astype('float32') - x_lod = [[0, 5, 7], [0, 2, 5, 7]] + x_lod = [[0, 2, 3], [0, 2, 5, 7]] self.inputs = {'X': (x_data, x_lod)} self.repeat = 2 @@ -95,10 +106,11 @@ class TestSeqExpandCase4(TestSeqExpand): x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') x_lod = [[0, 2, 5]] y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') - y_lod = [[0, 4, 13], [0, 2, 4, 7, 10, 13]] + y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]] self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} self.repeat = None if __name__ == '__main__': unittest.main() +# TestSeqExpandCase4().setUp() From 423d7438a1960b4314fff0db873197acd92ec5c3 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Mon, 23 Oct 2017 14:03:17 -0700 Subject: [PATCH 21/81] "add register gpu macro" --- paddle/framework/op_registry.h | 4 + paddle/operators/CMakeLists.txt | 4 +- paddle/operators/nccl/CMakeLists.txt | 2 +- paddle/operators/nccl_op.cc | 81 +++++++++++++++++-- paddle/operators/nccl_op.cu | 77 ++++++++++++++++++ .../v2/framework/tests/test_nccl_reduce_op.py | 6 ++ 6 files changed, 165 insertions(+), 9 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_nccl_reduce_op.py diff --git a/paddle/framework/op_registry.h 
b/paddle/framework/op_registry.h
index 226e8ddcd4..6ab65ef5e7 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -233,6 +233,10 @@ class OpKernelRegistrar : public Registrar {
   USE_OP_ITSELF(op_type);    \
   USE_OP_DEVICE_KERNEL(op_type, CPU);

+#define USE_GPU_ONLY_OP(op_type) \
+  USE_OP_ITSELF(op_type);        \
+  USE_OP_DEVICE_KERNEL(op_type, GPU)
+
 #define USE_OP(op_type)   \
   USE_OP_ITSELF(op_type); \
   USE_OP_KERNEL(op_type)
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 4faf9bbb08..0ea1037a7b 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -80,8 +80,8 @@ function(op_library TARGET)
   if ("${TARGET}" STREQUAL "nccl_op")
     set(pybind_flag 1)
     # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(ncclInit);\n")
-    # file(APPEND ${pybind_file} "USE_OP(ncclInit);\n")
+    file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclInit);\n")
+    file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
   endif()

   # reduce_op contains several operators
diff --git a/paddle/operators/nccl/CMakeLists.txt b/paddle/operators/nccl/CMakeLists.txt
index bdd873b3f3..21cc1d9ee9 100644
--- a/paddle/operators/nccl/CMakeLists.txt
+++ b/paddle/operators/nccl/CMakeLists.txt
@@ -1,4 +1,4 @@
 if(WITH_GPU)
-  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
+  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
   nv_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common)
 endif()
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index 91584a377e..f0f7b205b6 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -67,6 +67,54 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
   }
 };

+// ReduceOp
+class NCCLReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   " Input(X) of Reduce op should not be NULL");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Communicator"),
+        " Input(Communicator) of Reduce op should not be NULL");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   " Output(Out) of Reduce op should not be NULL");
+
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+// BcastSendOp
+class NCCLBcastSendOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   " Input(X) of Bcast op should not be NULL");
+    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
+                   " Input(Communicator) of Bcast op should not be NULL");
+  }
+};
+
+// BcastRecvOp
+class NCCLBcastRecvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
+                   " Input(Communicator) of Bcast op should not be NULL");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   " Output(Out) of Bcast op should not be NULL");
+  }
+};
+
 // AllreduceOp
 class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -85,15 +133,31 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };

+// BcastSend should be in the
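With USE_GPU_ONLY_OP defined above, the generated pybind file can reference an operator that registers no CPU kernel; a use site such as

    USE_GPU_ONLY_OP(ncclAllReduce);

expands to

    USE_OP_ITSELF(ncclAllReduce);
    USE_OP_DEVICE_KERNEL(ncclAllReduce, GPU);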
root +// BcastSendOp +class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLAllBcastSendOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of BcastSend op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddAttr("root", "root gpu of Bcast"); + AddComment(R"DOC( + Bcast the tensors. + )DOC"); + } +}; + // BcastOp -class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { +class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCCLAllBcastOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + NCCLAllBcastRecvOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input of Bcast op"); AddInput("Communicator", "Communicator for communicating between gpus"); - AddInput("root", "root gpu of Bcast"); + AddAttr("root", "root gpu of BcastRecv"); + AddOutput("Out", "The output of Bcast"); AddComment(R"DOC( Bcast the tensors. )DOC"); @@ -108,7 +172,6 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of Reduce op"); AddInput("Communicator", "Communicator for communicating between gpus"); - AddInput("root", "root gpu of Reduce"); AddOutput("Out", "The output of Reduce op"); AddComment(R"DOC( Reduce the tensors. @@ -123,4 +186,10 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp, ops::NCCLAllReduceOpMaker); REGISTER_OP_WITHOUT_GRADIENT(ncclInit, ops::NCCLInitOp, ops::NCCLInitOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(ncclBcastSend, ops::NCCLBcastSendOp, + ops::NCCLBcastSendOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(ncclBcastRecv, ops::NCCLBcastRecvOp, + ops::NCCLBcastRecvOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp, + ops::NCCLReduceOpMaker); REGISTER_OP_CPU_KERNEL(ncclInit, ops::NCCLInitKernel); diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index 6b0a325d17..4d91a3055f 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -10,6 +10,8 @@ See the License for the specific language governing permissions and limitations under the License. 
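The send/recv pair registered above shares one collective, ncclBcast, with the root attribute deciding which rank supplies the payload. A hedged sketch of the shared call shape (BcastOne is invented; NCCLTypeWrapper as in nccl_op.h):

    template <typename T>
    ncclResult_t BcastOne(T* buf, size_t n, int root, ncclComm_t comm,
                          cudaStream_t stream) {
      // On the root rank buf is read; on every other rank it is written.
      return ncclBcast(buf, n, NCCLTypeWrapper<T>::type, root, comm, stream);
    }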
*/ #define EIGEN_USE_GPU +#include + #include "paddle/operators/nccl_op.h" namespace paddle { @@ -59,8 +61,83 @@ class NCCLAllReduceKernel : public framework::OpKernel { } }; +template +class NCCLReduceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + auto ins = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + + auto* comm = ctx.Input("Communicator"); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + // device id + int device_id = + boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(device_id); + + for (size_t i = 0; i < ins.size(); ++i) { + int root = std::hash() % comm->comms_.size(); + T* recvbuffer = nullptr; + if (root == device_id) { + recvbuffer = outs[i]->mutable_data(ctx.GetPlace()); + } + PADDLE_ENFORCE(ncclReduce(ins[i]->data(), recvbuffer, ins[i]->numel(), + NCCLTypeWrapper::type, root, ncclSum, + comm->comms_[idx], stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + } + } +}; + +template +class NCCLBcastKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + int root = ctx.Attr("root"); + + auto* comm = ctx.Input("Communicator"); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + // device id + int device_id = + boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(device_id); + if (idx == root) { + auto ins = ctx.MultiInput("X"); + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE(ncclBcast((void*)ins[i]->data(), ins[i]->numel(), + NCCLTypeWrapper::type, root, + comm->comms_[idx], stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + } + } else { + auto outs = ctx.MultiOutput("Out"); + for (size_t i = 0; i < outs.size(); ++i) { + PADDLE_ENFORCE(ncclBcast((void*)outs[i]->mutable_data(), + outs[i]->numel(), NCCLTypeWrapper::type, + root, comm->comms_[idx], stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + } + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); +REGISTER_OP_GPU_KERNEL(ncclBcastSend, ops::NCCLBcastKernel); +REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel); +REGISTER_OP_GPU_KERNEL(ncclBcastRecv, ops::NCCLBcastKernel); diff --git a/python/paddle/v2/framework/tests/test_nccl_reduce_op.py b/python/paddle/v2/framework/tests/test_nccl_reduce_op.py new file mode 100644 index 0000000000..675ad5766c --- /dev/null +++ b/python/paddle/v2/framework/tests/test_nccl_reduce_op.py @@ -0,0 +1,6 @@ +import unittest, os +import numpy as np +import paddle.v2 as paddle +from paddle.v2.framework.op import Operator +import paddle.v2.framework.core as core +from op_test import OpTest, create_op, set_input From ec47565c23f872d5f8c1607b7c44c5e3d155c676 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Mon, 23 Oct 2017 14:53:17 -0700 Subject: [PATCH 22/81] "add reduce hash function" --- paddle/framework/operator.h | 9 +++++++++ paddle/operators/nccl_op.cc | 11 ++++------- paddle/operators/nccl_op.cu | 29 +++++++++-------------------- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index cf15f9933a..8cdb07e677 100644 --- 
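The std::hash call in the Reduce kernel above is what the next patch names hasher: the root is derived from the input variable's name, so every rank picks the same root with no extra coordination. As a standalone sketch (SelectRoot is a name invented here):

    #include <functional>
    #include <string>

    int SelectRoot(const std::string& var_name, size_t num_ranks) {
      std::hash<std::string> hasher;
      return static_cast<int>(hasher(var_name) % num_ranks);
    }

Identical names hash identically on every rank, which is the whole trick.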
a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -289,6 +289,15 @@ class ExecutionContext {
     return device_context_;
   }

+  //! Get an input which has multiple variables.
+  const std::vector<std::string>& Inputs(const std::string& name) const {
+    return op_.Inputs(name);
+  }
+  //! Get an output which has multiple variables.
+  const std::vector<std::string>& Outputs(const std::string& name) const {
+    return op_.Outputs(name);
+  }
+
 #ifdef PADDLE_WITH_CUDA
   const platform::CUDADeviceContext& cuda_device_context() const {
     PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index f0f7b205b6..89dedfc158 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -81,9 +81,6 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
         " Input(Communicator) of Reduce op should not be NULL");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    " Output(Out) of Reduce op should not be NULL");
-
-    ctx->SetOutputsDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };

@@ -137,8 +134,8 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 // BcastSendOp
 class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLAllBcastSendOpMaker(framework::OpProto *proto,
-                          framework::OpAttrChecker *op_checker)
+  NCCLBcastSendOpMaker(framework::OpProto *proto,
+                       framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of BcastSend op");
     AddInput("Communicator", "Communicator for communicating between gpus");
@@ -152,8 +149,8 @@ class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker {
 // BcastOp
 class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLAllBcastRecvOpMaker(framework::OpProto *proto,
-                          framework::OpAttrChecker *op_checker)
+  NCCLBcastRecvOpMaker(framework::OpProto *proto,
+                       framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddAttr<int>("root", "root gpu of BcastRecv");
diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu
index 4d91a3055f..5f8e0a886b 100644
--- a/paddle/operators/nccl_op.cu
+++ b/paddle/operators/nccl_op.cu
@@ -2,8 +2,8 @@
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and @@ -27,25 +27,12 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput("Out"); std::string reduction = ctx.Attr("reduction"); - ncclRedOp_t op_type; - if (reduction == "ncclSum") { - op_type = ncclSum; - } else if (reduction == "ncclProd") { - op_type = ncclProd; - } else if (reduction == "ncclMin") { - op_type = ncclMin; - } else if (reduction == "ncclMax") { - op_type = ncclMax; - } else { - PADDLE_ENFORCE(false, "reduction error."); - } auto* comm = ctx.Input("Communicator"); auto stream = reinterpret_cast( ctx.device_context()) .stream(); - // device id int device_id = boost::get(ctx.GetPlace()).GetDeviceId(); @@ -54,7 +41,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { for (size_t i = 0; i < ins.size(); ++i) { PADDLE_ENFORCE(ncclAllReduce( ins[i]->data(), outs[i]->mutable_data(ctx.GetPlace()), - outs[i]->numel() * sizeof(T), NCCLTypeWrapper::type, op_type, + outs[i]->numel() * sizeof(T), NCCLTypeWrapper::type, ncclSum, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); } @@ -68,7 +55,7 @@ class NCCLReduceKernel : public framework::OpKernel { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - auto ins = ctx.MultiInput("X"); + auto ins = ctx.MultiInput("X"); // x0, x1, x2 auto outs = ctx.MultiOutput("Out"); auto* comm = ctx.Input("Communicator"); @@ -81,14 +68,16 @@ class NCCLReduceKernel : public framework::OpKernel { boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(device_id); + auto ins_names = ctx.Inputs("X"); + std::hash hasher; for (size_t i = 0; i < ins.size(); ++i) { - int root = std::hash() % comm->comms_.size(); + int root = hasher(ins_names[i]) % comm->comms_.size(); T* recvbuffer = nullptr; if (root == device_id) { recvbuffer = outs[i]->mutable_data(ctx.GetPlace()); } PADDLE_ENFORCE(ncclReduce(ins[i]->data(), recvbuffer, ins[i]->numel(), - NCCLTypeWrapper::type, root, ncclSum, + NCCLTypeWrapper::type, ncclSum, root, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); } @@ -124,7 +113,7 @@ class NCCLBcastKernel : public framework::OpKernel { } else { auto outs = ctx.MultiOutput("Out"); for (size_t i = 0; i < outs.size(); ++i) { - PADDLE_ENFORCE(ncclBcast((void*)outs[i]->mutable_data(), + PADDLE_ENFORCE(ncclBcast(outs[i]->mutable_data(ctx.GetPlace()), outs[i]->numel(), NCCLTypeWrapper::type, root, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); From 50f04dcae37f1574db482fdc65d53aaabdef6778 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Mon, 23 Oct 2017 17:13:31 -0700 Subject: [PATCH 23/81] "add init allreduce test" --- paddle/operators/CMakeLists.txt | 3 +- paddle/operators/nccl/nccl_gpu_common.h | 43 +------ paddle/operators/nccl_op.cc | 7 +- paddle/operators/nccl_op.cu | 20 ++-- .../framework/tests/test_nccl_allreduce_op.py | 106 ++++++++++++++++++ 5 files changed, 125 insertions(+), 54 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_nccl_allreduce_op.py diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 2574e93419..5da637dd7d 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -80,8 +80,8 @@ function(op_library TARGET) if ("${TARGET}" STREQUAL "nccl_op") set(pybind_flag 1) # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclInit);\n") file(APPEND 
${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n") + file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(ncclInit);\n") endif() # reduce_op contains several operators @@ -148,7 +148,6 @@ foreach(src ${GENERAL_OPS}) endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") -message(STATUS "operators_list: ${OP_LIBRARY}") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h index 2b7510de1c..648693508d 100644 --- a/paddle/operators/nccl/nccl_gpu_common.h +++ b/paddle/operators/nccl/nccl_gpu_common.h @@ -23,48 +23,12 @@ #include #include "paddle/platform/device_context.h" +#include "paddle/platform/dynload/nccl.h" #include "paddle/platform/enforce.h" namespace paddle { namespace platform { -class WaitGroup { - public: - inline void Add(int n) { - std::unique_lock lk(mu_); - PADDLE_ENFORCE(n >= 0, "add wait must >=0."); - counter_ += n; - } - - inline void Done(int n) { - std::unique_lock lk(mu_); - PADDLE_ENFORCE(n <= counter_, " wait group done unmatch to add."); - counter_ -= n; - if (counter_ == 0) { - cv_.notify_all(); - } - } - - inline void Add() { Add(1); } - - inline void Done() { Done(1); } - - inline void Wait() { - std::unique_lock lk(mu_); - cv_.wait(lk, [&] { return counter_ == 0; }); - } - - inline int GetCount() { - std::unique_lock lk(mu_); - return counter_; - } - - private: - int counter_ = 0; - std::mutex mu_; - std::condition_variable cv_; -}; - struct Communicator { std::vector comms_; std::unordered_map comm_id_map_; @@ -76,12 +40,13 @@ struct Communicator { for (size_t i = 0; i < gpus.size(); ++i) { comm_id_map_[gpus[i]] = i; } - PADDLE_ENFORCE(ncclCommInitAll(comms_.data(), gpus.size(), gpus.data())); + PADDLE_ENFORCE( + dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data())); } ~Communicator() { for (size_t i = 0; i < comms_.size(); ++i) { - PADDLE_ENFORCE(ncclCommDestroy(comms_[i])); + PADDLE_ENFORCE(dynload::ncclCommDestroy(comms_[i])); } } diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 89dedfc158..ee6ed0ae85 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -21,8 +21,9 @@ class NCCLInitOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasOutput("Communicator"), - " Input(X) of AllReduce op input should not be NULL"); + PADDLE_ENFORCE( + ctx->HasOutput("Communicator"), + " Output(Communicator) of ncclInit op input should not be NULL"); } }; @@ -123,7 +124,7 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output of AllReduce op"); AddAttr("reduction", "{'ncclmin', 'ncclmax', 'ncclprod', 'ncclsum'}."); - AddAttr>("gpus", "gpu id lists"); + // AddAttr>("gpus", "gpu id lists"); AddComment(R"DOC( AllReduce the input tensors. 
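For example (illustrative): with two GPUs and X = [1, 2] on GPU 0 and
X = [3, 4] on GPU 1, every GPU ends up holding Out = [4, 6] under ncclSum.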
)DOC"); diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index 5f8e0a886b..ee19a69afc 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -39,7 +39,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { int idx = comm->GetCommId(device_id); for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE(ncclAllReduce( + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( ins[i]->data(), outs[i]->mutable_data(ctx.GetPlace()), outs[i]->numel() * sizeof(T), NCCLTypeWrapper::type, ncclSum, comm->comms_[idx], stream)); @@ -76,9 +76,9 @@ class NCCLReduceKernel : public framework::OpKernel { if (root == device_id) { recvbuffer = outs[i]->mutable_data(ctx.GetPlace()); } - PADDLE_ENFORCE(ncclReduce(ins[i]->data(), recvbuffer, ins[i]->numel(), - NCCLTypeWrapper::type, ncclSum, root, - comm->comms_[idx], stream)); + PADDLE_ENFORCE(platform::dynload::ncclReduce( + ins[i]->data(), recvbuffer, ins[i]->numel(), + NCCLTypeWrapper::type, ncclSum, root, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); } } @@ -105,17 +105,17 @@ class NCCLBcastKernel : public framework::OpKernel { if (idx == root) { auto ins = ctx.MultiInput("X"); for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE(ncclBcast((void*)ins[i]->data(), ins[i]->numel(), - NCCLTypeWrapper::type, root, - comm->comms_[idx], stream)); + PADDLE_ENFORCE(platform::dynload::ncclBcast( + (void*)ins[i]->data(), ins[i]->numel(), NCCLTypeWrapper::type, + root, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); } } else { auto outs = ctx.MultiOutput("Out"); for (size_t i = 0; i < outs.size(); ++i) { - PADDLE_ENFORCE(ncclBcast(outs[i]->mutable_data(ctx.GetPlace()), - outs[i]->numel(), NCCLTypeWrapper::type, - root, comm->comms_[idx], stream)); + PADDLE_ENFORCE(platform::dynload::ncclBcast( + outs[i]->mutable_data(ctx.GetPlace()), outs[i]->numel(), + NCCLTypeWrapper::type, root, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); } } diff --git a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py new file mode 100644 index 0000000000..0e6927a24d --- /dev/null +++ b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py @@ -0,0 +1,106 @@ +import unittest, os +import numpy as np +import paddle.v2 as paddle +from paddle.v2.framework.op import Operator +import paddle.v2.framework.core as core +from op_test import OpTest, create_op, set_input + +# gpu_list = os.environ["NV_LIST"] +gpu_list = "0,1,2,3" + +if not core.is_compile_gpu() or not gpu_list: + exit(0) + +g_scope = core.Scope() +g_ctx = core.DeviceContext.create(core.CPUPlace()) + + +class TestNCCLInit(OpTest): + def setUp(self): + self.op_type = "ncclInit" + self.gpus = [int(g) for g in gpu_list.split(",")] + + self.attrs = {"gpus": self.gpus} + self.scope = g_scope.var("Communicator") + self.outputs = {"Communicator": self.scope.var("Communicator")} + + def test_check_output(self): + self.check_output() + + +class TestNCCLAllReduce(unittest.TestCase): + def setUp(self): + # cpu allreduce for check + def allreduce(tensors, gpus): + num_device = len(gpus) + assert ( + len(tensors) == num_device), "not match of tensor and device" + Out = tensors + for i in range(1, len(tensors)): + Out[0] += Out[i] + + for i in range(1, len(tensors)): + Out[i] = Out[0] + + return Out + + self.op_type = "ncclAllReduce" + + self.gpus = [int(g) for g in gpu_list.split(",")] + + self.g_scope = core.Scope() + self.g_ctx = 
core.DeviceContext.create(core.CPUPlace()) + self.scopes = [] + self.ops = [] + self.places = [] + + self.input_data = [] + + for i in range(len(self.gpus)): + self.input_data.append(np.random.random((32, 32))) + self.output_data = allreduce(self.input_data, self.gpus) + + nccl_init = Operator("ncclInit", Out="Communicator", gpus=self.gpus) + nccl_init.run(self.g_scope, self.g_ctx) + + for i in range(len(self.gpus)): + # insert kid scope + scope = self.g_scope.new_scope() + place = core.GPUPlace(self.gpus[i]) + + inputs = { + "X": self.input_data[i], + "Communicator": scope.find_var("Communicator") + } + outputs = {"Out": self.output_data[i]} + # attrs = {"gpus": self.gpus} + + op = create_op(scope, self.op_type, inputs, outputs, attrs) + set_input(scope, op, inputs, place) + + self.scopes.append(scope) + self.ops.append(op) + self.places.append(place) + + def test_output(self): + idx = 0 + for scope, place, op in zip(self.scopes, self.places, self.ops): + ctx = core.DeviceContext.create(place) + op.run(scope, ctx) + + for out_name, out_dup in Operator.get_op_outputs(self.op.type()): + actual = np.array(scope.find_var(out_name).get_tensor()) + expect = self.output_data[idx] + + idx += 1 + self.assertTrue(actual, expect), "has diff" + + +# if __name__ == "__main__": +# unittest.main() +# usage : export NV_LIST=0,1,2,3 python *.py + +# os.environ["NV_LIST"] = ["0,1,2,3"] + +if __name__ == "__main__": + unittest.main() From 296167446c35228c7e259677d82a3c85b896a7b5 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 24 Oct 2017 14:10:02 +0800 Subject: [PATCH 24/81] Rewrite sequence expand op --- paddle/framework/lod_tensor.cc | 23 ---- paddle/framework/lod_tensor.h | 3 - paddle/operators/seq_expand_op.cc | 109 +++++++-------- paddle/operators/seq_expand_op.h | 128 +++++------------- python/paddle/v2/framework/tests/op_test.py | 2 - .../v2/framework/tests/test_seq_expand.py | 96 +++---------- 6 files changed, 97 insertions(+), 264 deletions(-) diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index a7b2b5b1ec..7c0ea0df78 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -112,28 +112,5 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin, lod_ = new_lod; } -Vector expand_lod(Vector level, Vector indexes, - Vector scales, bool repeat) { - Vector result; - result.push_back(level[0]); - size_t start = 0, end = 0; - if (!repeat) { - for (size_t i = 0; i < scales.size(); ++i) { - result.push_back(result.back() + scales[i] * (level[i + 1] - level[i])); - } - } else { - for (size_t i = 0; i < scales.size(); ++i) { - start = indexes[i]; - end = indexes[i + 1]; - for (size_t j = 0; j < scales[i]; ++j) { - for (size_t index = start; index < end - 1; ++index) { - result.push_back(result.back() + level[index + 1] - level[index]); - } - } - } - } - return result; -} - } // namespace framework } // namespace paddle diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index ec0b34878b..3895d3cb83 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -136,8 +136,5 @@ class LoDTensor : public Tensor { LoD lod_; }; -Vector expand_lod(Vector level, Vector indexes, - Vector scales, bool repeat); - } // namespace framework } // namespace paddle diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc index d02a94d164..660e86e9cc 100644 --- a/paddle/operators/seq_expand_op.cc +++ b/paddle/operators/seq_expand_op.cc @@ -27,20 +27,14 @@ class SeqExpandOp : public 
framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of SeqExpandOp should not be null.");
-    int repeat = ctx->Attrs().Get<int>("repeat");
-    framework::DDim out_dim;
-    if (repeat == 0) {
-      PADDLE_ENFORCE(
-          ctx->HasInput("Y"),
-          "Input(Y) of SeqExpandOp should not be null while repeat == 0.");
-      out_dim = ctx->GetInputDim("Y");
-      ctx->ShareLoD("Y", "Out");
-    } else {
-      out_dim = ctx->GetInputDim("X");
-      out_dim[0] = out_dim[0] * repeat;
-    }
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SeqExpandOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of SeqExpandOp should not be null.");
+    framework::DDim out_dim;
+    out_dim = ctx->GetInputDim("Y");
+    ctx->ShareLoD("Y", "Out");
     ctx->SetOutputDim("Out", out_dim);
   }
 };
@@ -50,68 +44,63 @@ class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker {
   SeqExpandOpMaker(framework::OpProto* proto,
                    framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "X",
-        "The input('X') of seq_expand op. It can be LoDTensor or base Tensor.");
-    AddInput(
-        "Y",
-        "The reference input('Y') of seq_expand op."
-        "It must be a LoDTensor with k-level(k>0)."
-        "This reference input is essential if 'repeat' attribute is not "
-        "configured."
-        "Input(X) will be expanded by LoD of input(Y) while repeat == 0.");
+    AddInput("X",
+             "(Tensor or LoDTensor) The input('X') of this operator can be a "
+             "LoDTensor or a base Tensor.");
+    AddInput("Y",
+             "(LoDTensor) The reference input('Y') of the seq_expand op. "
+             "It must be a LoDTensor with k levels (k > 0). "
+             "Input(X) will be expanded according to the LoD of input(Y). "
+             "The number of elements in the last LoD level of input('Y') "
+             "must be equal to dims[0] of input('X').");
     AddOutput("Out",
               "The output of seq_expand op."
-              "The output is a (k+1)-level LoDTensor"
-              "while input(X) being k-level LoDTensor."
-              "(Given base tensor is 0-level LoDTensor.)");
-    AddAttr<int>("repeat",
-                 "(type:int; default value: 0)"
-                 "Repeatting times of each element while expanding input(X)."
-                 "It works while input(Y) is not configured.")
-        .SetDefault(0);
+              "The LoD of the output will be the same as input(Y)'s LoD.");
     AddComment(R"DOC(
-Expand k-level LoDTensor to (k+1)-level LoDTensor
-by lod of input(Y) or 'repeat' attribute.
+Expand input(X) according to the LoD of input(Y).
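+In every case, the last LoD level of input(Y) determines how many copies of
+each row of input(X) appear in Out, as the cases below illustrate.
+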
Case 1:
-Given a 2-level LoDTensor X:
-    X.data = [a, b , c, d]
-    X.lod = [[0, 3, 4], [0, 1, 3, 4]]
-and
-    repeat = 2
-then we get 3-level LoDTensor
-    Out.lod = [[0, 6, 8],
-               [0, 3, 6, 7, 8],
-               [0, 1, 3, 4, 6, 7, 8]]
-    Out.data = [a, b, c, a, b, c, d, d]
+Given a 2-level LoDTensor input(X)
+    X.lod = [[0, 2, 3],
+             [0, 1, 3, 4]]
+    X.data = [a, b, c, d]
+    X.dims = [4, 1]
+and input(Y)
+    Y.lod = [[0, 2, 4],
+             [0, 3, 6, 7, 8]]
+then we get 2-level LoDTensor
+    Out.lod = [[0, 2, 4],
+               [0, 3, 6, 7, 8]]
+    Out.data = [a, a, a, b, b, b, c, d]
+    Out.dims = [8, 1]

 Case 2:
-Given 2-level a LoDTensor X
-    X.data = [1, 2, 3, 4]
-    X.lod = [[0, 3, 4], [0, 1, 3, 4]]
-and
-    Y.lod = [[0, 6, 8],
-             [0, 3, 6, 7, 8],
-             [0,1,3,4,6,7,8]]
-then we get 3-level LoDTensor
-    Out.data = [1, 2, 3, 1, 2, 3, 4, 4]
-    Out.lod = [[0, 6, 8],
-               [0, 3, 6, 7, 8],
-               [0, 1, 3, 4, 6, 7, 8]]
+Given a 0-level LoDTensor input(X)
+    X.data = [a, b, c]
+    X.lod = NULL
+    X.dims = [3, 1]
+and input(Y)
+    Y.lod = [[0, 2, 3, 6]]
+then we get 1-level LoDTensor
+    Out.lod = [[0, 2, 3, 6]]
+    Out.data = [a, a, b, c, c, c]
+    Out.dims = [6, 1]

 Case 3:
-Given a 0-level LoDTensor X
-    X.data = [1, 2, 3, 4]
+Given a 0-level LoDTensor input(X)
+    X.data = [[a, b], [c, d], [e, f]]
     X.lod = NULL
-and
-    repeat = 2
+    X.dims = [3, 2]
+and input(Y)
+    Y.lod = [[0, 2, 3, 6]]
 then we get 1-level LoDTensor
-    Out.data = [1, 1, 2, 2, 3, 3, 4, 4]
-    Out.lod = [[0, 2, 4, 6, 8]]
+    Out.lod = [[0, 2, 3, 6]]
+    Out.data = [[a, b], [a, b], [c, d], [e, f], [e, f], [e, f]]
+    Out.dims = [6, 2]
+
 )DOC");
   }
diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h
index e31f60db49..ad3f42116d 100644
--- a/paddle/operators/seq_expand_op.h
+++ b/paddle/operators/seq_expand_op.h
@@ -31,93 +31,28 @@ class SeqExpandKernel : public framework::OpKernel<T> {
     auto* out = context.Output<LoDTensor>("Out");
     const T* x_data = x->data<T>();
     auto x_dims = x->dims();
-    auto x_lod = x->lod();
-
-    framework::Vector<size_t> level;
-    size_t num = (x_lod.size() == 0) ?
(x->dims()[0] + 1) : x_lod[0].size(); - for (int i = 0; i < num; ++i) { - level.push_back(i); - } - x_lod.push_back(level); - - size_t repeat = static_cast(context.Attr("repeat")); - framework::Vector scales; - if (repeat != 0) { - for (int i = 0; i < x_lod[0].size() - 1; ++i) { - scales.push_back(repeat); - } - std::vector dims = framework::vectorize(x->dims()); - dims[0] = dims[0] * repeat; - auto out_dims = framework::make_ddim(dims); - out->Resize(out_dims); - } else { - auto* y = context.Input("Y"); - auto y_lod = y->lod(); - auto y_abs_lod = y_lod.ToAbsOffset(); - auto x_abs_lod = x_lod.ToAbsOffset(); - for (int i = 0; i < y_abs_lod[0].size() - 1; ++i) { - scales.push_back((y_abs_lod[0][i + 1] - y_abs_lod[0][i]) / - (x_abs_lod[0][i + 1] - x_abs_lod[0][i])); - } - out->Resize(y->dims()); - } - - framework::Vector indexes; - for (int size_t i = 0; i < x_lod[0]; ++i) { - indexes[i] = x_lod[0]; - } - framework::LoD out_lod; - auto level0 = framework::expand_lod(indexes, x_lod[0], scales, false); - out_lod.push_back(level0); - for (int i = 1; i < x_lod.size(); ++i) { - for (int j = 0; j < indexes.size(); ++j) { - indexes[j] = x_lod[i - 1][indexes[j]]; - } - out_lod.push_back(framework::expand_lod(x_lod[i], indexes, scales, true)); - } - + auto* y = context.Input("Y"); + PADDLE_ENFORCE_EQ(x_dims[0], y->lod().back().size() - 1, + "The size of last lod level in Input(Y)" + "must be equal to dims[0] of Input(X)."); + out->set_lod(y->lod()); + out->Resize(y->dims()); + auto place = context.GetEigenDevice(); size_t element_len = framework::product(x_dims) / x_dims[0]; T* out_data = out->mutable_data(context.GetPlace()); - - // copy data - auto place = context.GetPlace(); - size_t count = 0; - if (platform::is_cpu_place(place)) { - auto& cpu_place = boost::get(place); - for (size_t i = 0; i < scales.size(); ++i) { - count = element_len * (x_abs_lod[0][i + 1] - x_abs_lod[0][i]); - for (size_t j = 0; j < scales[i]; ++j) { - memory::Copy(cpu_place, out_data, cpu_place, x_data, - sizeof(T) * count); - out_data += count; - } - x_data += count; - } - } else { -#ifdef PADDLE_WITH_CUDA - auto& gpu_place = boost::get(place); - auto stream = reinterpret_cast( - context.device_context()) - .stream(); - for (size_t i = 0; i < scales.size(); ++i) { - count = element_len * (x_abs_lod[0][i + 1] - x_abs_lod[0][i]); - for (size_t j = 0; j < scales[i]; ++j) { - memory::Copy(gpu_place, out_data, gpu_place, x_data, - sizeof(T) * count, stream); - out_data += count; - } - x_data += count; - } -#else - PADDLE_THROW("Paddle is not compiled with GPU"); -#endif - } - - out->set_lod(out_lod); - for (size_t i = 0; i < lod.size; i++) { - for (size_t j = 0; j < lod[i].size(); j++) { - LOG(INFO) << "lod[" << i << "][" << j "] = " << lod[i][j]; - } + auto out_starts = out->lod().back(); + + for (size_t i = 0; i < out_starts.size() - 1; i++) { + int scale = out_starts[i + 1] - out_starts[i]; + Eigen::TensorMap< + Eigen::Tensor> + x_t(x_data, 1, element_len); + Eigen::TensorMap> + out_t(out_data, scale, element_len); + Eigen::array cast({scale, 1}); + out_t.device(place) = x_t.broadcast(cast); + x_data += element_len; + out_data += element_len * scale; } } }; @@ -130,25 +65,24 @@ class SeqExpandGradKernel : public framework::OpKernel { auto* x = context.Input("X"); auto* out = context.Input("Out"); auto* d_x = context.Output(framework::GradVarName("X")); - auto out_lod = out->lod(); - auto out_abs_lod = out_lod.ToAbsOffset(); + auto out_last_level = out->lod().back(); d_x->set_lod(x->lod()); const T* d_out_data = d_out->data(); 
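The forward kernel above expresses the expansion as a per-sequence Eigen broadcast; an equivalent plain-loop formulation makes the memory layout explicit (a sketch, ExpandRows is a name invented here; starts is the last LoD level of Y as absolute offsets):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Row i of x (width elements) is copied into output rows
    // starts[i] .. starts[i+1]-1.
    template <typename T>
    void ExpandRows(const T* x, T* out, size_t width,
                    const std::vector<size_t>& starts) {
      for (size_t i = 0; i + 1 < starts.size(); ++i) {
        for (size_t j = starts[i]; j < starts[i + 1]; ++j) {
          std::copy(x + i * width, x + (i + 1) * width, out + j * width);
        }
      }
    }

The backward kernel that follows undoes this by summing each group of repeated rows of d_out back into a single row of d_x.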
auto d_out_dims = d_out->dims(); T* d_x_data = d_x->mutable_data(context.GetPlace()); size_t element_len = framework::product(d_out_dims) / d_out_dims[0]; - for (size_t i = 0; i < out->NumElements(); ++i) { - size_t ele_count = out_abs_lod[0][i + 1] - out_abs_lod[0][i]; - size_t repeat = out->NumElements(0, i); - Eigen::TensorMap> d_out_t( - d_out_data, static_cast(repeat), - static_cast((ele_count * element_len) / repeat)); - Eigen::TensorMap> d_x_t( - d_x_data, static_cast((ele_count * element_len) / repeat)); + + for (size_t i = 0; i < out_last_level.size() - 1; ++i) { + size_t repeat = out_last_level[i + 1] - out_last_level[i]; + Eigen::TensorMap< + Eigen::Tensor> + d_out_t(d_out_data, static_cast(repeat), element_len); + Eigen::TensorMap> + d_x_t(d_x_data, static_cast(element_len)); auto place = context.GetEigenDevice(); d_x_t.device(place) = d_out_t.sum(Eigen::array({{0}})); - d_out_data += (ele_count * element_len); - d_x_data += ((ele_count * element_len) / repeat); + d_out_data += (repeat * element_len); + d_x_data += element_len; } } }; diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py index f3108d5108..a88e9f0bb8 100644 --- a/python/paddle/v2/framework/tests/op_test.py +++ b/python/paddle/v2/framework/tests/op_test.py @@ -246,8 +246,6 @@ class OpTest(unittest.TestCase): else: actual = np.array(self.scope.find_var(out_name).get_tensor()) expect = self.outputs[out_name] - print "actual= %s" % actual - print "expect = %s" % expect self.assertTrue( np.allclose( actual, expect, atol=atol), diff --git a/python/paddle/v2/framework/tests/test_seq_expand.py b/python/paddle/v2/framework/tests/test_seq_expand.py index 2910af6b78..901102802b 100644 --- a/python/paddle/v2/framework/tests/test_seq_expand.py +++ b/python/paddle/v2/framework/tests/test_seq_expand.py @@ -3,66 +3,21 @@ import numpy as np from op_test import OpTest -def repeat(list, starts, times, is_first): - newlist = [list[0]] - if is_first: - for i, time in enumerate(times): - size = list[i + 1] - list[i] - newlist.append(newlist[-1] + size * time) - else: - for i, time in enumerate(times): - start = list.index(starts[i]) - end = list.index(starts[i + 1]) + 1 - for t in range(time): - for index in range(start, end - 1): - newlist.append(newlist[-1] + list[index + 1] - list[index]) - return newlist - - -def repeat_array(array, starts, times): - newlist = [] - for i, time in enumerate(times): - for t in range(time): - newlist.extend(array[starts[i]:starts[i + 1]]) - return newlist - - -def toAbsOffset(lod): - for i in range(len(lod) - 2, -1, -1): - for j in range(len(lod[i])): - lod[i][j] = lod[i + 1][lod[i][j]] - return lod - - class TestSeqExpand(OpTest): - #class TestSeqExpand(): def set_data(self): - x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') - self.inputs = {'X': x_data} - self.repeat = 2 + x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') + y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') + y_lod = [[0, 1, 4, 8]] + self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} def compute(self): x = self.inputs['X'] - print "x= %s" % x x_data, x_lod = x if type(x) == tuple else (x, None) n = 1 + x_data.shape[0] if not x_lod else len(x_lod[0]) - x_lod = [[i for i in range(n)]] + x_lod - x_abs_lod = toAbsOffset(x_lod) - if self.repeat: - print "repeat= %s" % self.repeat - self.attrs = {'repeat': self.repeat} - repeats = (len(x_lod[0]) - 1) * [self.repeat] - else: - y_data, y_lod = self.inputs['Y'] - print "y_lod: %s" % y_lod - y_abs_lod = 
toAbsOffset(y_lod) - repeats = [((y_abs_lod[0][i + 1] - y_abs_lod[0][i]) / - (x_abs_lod[0][i + 1] - x_abs_lod[0][i])) - for i in range(len(y_abs_lod[0]) - 1)] - #out_lod = [repeat(x_lod[0], x_lod[0], repeats, True)] + [ - # repeat(lod, x_lod[0], repeats, False) for lod in x_lod[1:] - #] - out = repeat_array(x_data.tolist(), x_abs_lod[0], repeats) + y_data, y_lod = self.inputs['Y'] + repeats = [((y_lod[-1][i + 1] - y_lod[-1][i])) + for i in range(len(y_lod[-1]) - 1)] + out = x_data.repeat(repeats, axis=0) self.outputs = {'Out': out} def setUp(self): @@ -78,39 +33,22 @@ class TestSeqExpand(OpTest): class TestSeqExpandCase1(TestSeqExpand): - def set_data(self): - x_data = np.random.uniform(0.1, 1, [7, 1]).astype('float32') - x_lod = [[0, 2, 3], [0, 2, 5, 7]] - self.inputs = {'X': (x_data, x_lod)} - self.repeat = 2 - - -class TestSeqExpandCase2(TestSeqExpand): - def set_data(self): - x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') - self.inputs = {'X': x_data} - self.repeat = 2 - - -class TestSeqExpandCase3(TestSeqExpand): - def set_data(self): - x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') - y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') - y_lod = [[0, 1, 4, 8]] - self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} - self.repeat = None - - -class TestSeqExpandCase4(TestSeqExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') x_lod = [[0, 2, 5]] y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]] self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} - self.repeat = None + + +class TestSeqExpandCase2(TestSeqExpand): + def set_data(self): + x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32') + x_lod = [[0, 1]] + y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32') + y_lod = [[0, 2]] + self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} if __name__ == '__main__': unittest.main() -# TestSeqExpandCase4().setUp() From ef257e6d96e5b99710a9d63e11a6642163f4e018 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Tue, 24 Oct 2017 11:11:22 -0700 Subject: [PATCH 25/81] write nccl c++ test case --- paddle/operators/CMakeLists.txt | 4 + paddle/operators/nccl/CMakeLists.txt | 1 - paddle/operators/nccl/nccl_gpu_common.h | 2 - paddle/operators/nccl/nccl_gpu_common_test.cc | 33 ----- paddle/operators/nccl_op.cc | 27 ++-- paddle/operators/nccl_op.cu | 1 - paddle/operators/nccl_op.h | 4 +- paddle/operators/nccl_op_test.cc | 71 ++++++++++ paddle/operators/nccl_op_test.cu | 71 ++++++++++ paddle/pybind/pybind.cc | 13 +- .../v2/framework/tests/test_multigpu.py | 8 ++ .../framework/tests/test_nccl_allreduce_op.py | 122 +++++++++--------- .../v2/framework/tests/test_nccl_init_op.py | 36 ++++++ .../v2/framework/tests/test_nccl_reduce_op.py | 19 +++ 14 files changed, 298 insertions(+), 114 deletions(-) delete mode 100644 paddle/operators/nccl/nccl_gpu_common_test.cc create mode 100644 paddle/operators/nccl_op_test.cc create mode 100644 paddle/operators/nccl_op_test.cu create mode 100644 python/paddle/v2/framework/tests/test_multigpu.py create mode 100644 python/paddle/v2/framework/tests/test_nccl_init_op.py diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 5da637dd7d..0f2122b4b0 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -154,3 +154,7 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(strided_memcpy_test SRCS 
strided_memcpy_test.cc DEPS tensor paddle_memory) cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc DEPS dynamic_recurrent_op recurrent_op tensor_array) + +if(WITH_GPU) + nv_test(nccl_op_test SRCS nccl_op_test.cu DEPS nccl_op gpu_info device_context) +endif() diff --git a/paddle/operators/nccl/CMakeLists.txt b/paddle/operators/nccl/CMakeLists.txt index 21cc1d9ee9..ce0ddd89bf 100644 --- a/paddle/operators/nccl/CMakeLists.txt +++ b/paddle/operators/nccl/CMakeLists.txt @@ -1,4 +1,3 @@ if(WITH_GPU) nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) - nv_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common) endif() diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h index 648693508d..f492f96aa8 100644 --- a/paddle/operators/nccl/nccl_gpu_common.h +++ b/paddle/operators/nccl/nccl_gpu_common.h @@ -53,7 +53,5 @@ struct Communicator { // DISABLE_COPY_AND_ASSIGN(Communicator); }; -Communicator* NewCommunicator(const std::vector& gpus); - } // namespace platform } // namespace paddle diff --git a/paddle/operators/nccl/nccl_gpu_common_test.cc b/paddle/operators/nccl/nccl_gpu_common_test.cc deleted file mode 100644 index 6f6a4ac886..0000000000 --- a/paddle/operators/nccl/nccl_gpu_common_test.cc +++ /dev/null @@ -1,33 +0,0 @@ -#include "paddle/operators/nccl/nccl_gpu_common.h" - -#include - -#include -#include -#include - -namespace paddle { -namespace platform { - -TEST(WaitGroup, wait) { - WaitGroup wg; - auto run_thread = [&wg](int idx) { - wg.Add(1); - std::this_thread::sleep_for(std::chrono::seconds(1)); - wg.Done(); - }; - - std::vector ths; - constexpr const int TNUM = 5; - for (int i = 0; i < TNUM; ++i) { - ths.emplace_back(std::thread(run_thread, i)); - } - wg.Wait(); - - for (int i = 0; i < TNUM; ++i) { - ths[i].join(); - } -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index ee6ed0ae85..6213f23613 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -21,9 +21,14 @@ class NCCLInitOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE( - ctx->HasOutput("Communicator"), - " Output(Communicator) of ncclInit op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Communicator"), + " Output(Communicator) of ncclInitOp should not be NULL"); + } + + protected: + framework::DataType IndicateDataType( + const framework::ExecutionContext &ctx) const override { + return static_cast(ctx.Attr("data_type")); } }; @@ -32,9 +37,11 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker { NCCLInitOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddAttr>("gpus", "gpu id lists"); AddOutput("Communicator", "Create Communicator for communicating between gpus"); + AddAttr>("gpus", "gpu id lists"); + AddAttr("data_type", "output data type") + .SetDefault(framework::DataType::FP32); AddComment(R"DOC( create communicator. 
)DOC"); @@ -58,10 +65,10 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputsDim("X"); - std::string reduction = ctx->Attrs().Get("reduction"); - PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || - reduction == "ncclMin" || reduction == "ncclMax"), - "invalid reduction."); + // std::string reduction = ctx->Attrs().Get("reduction"); + // PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || + // reduction == "ncclMin" || reduction == "ncclMax"), + // "invalid reduction."); ctx->SetOutputsDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); @@ -122,8 +129,8 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input of AllReduce op"); AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of AllReduce op"); - AddAttr("reduction", - "{'ncclmin', 'ncclmax', 'ncclprod', 'ncclsum'}."); + // AddAttr("reduction", + // "{'ncclmin', 'ncclmax', 'ncclprod', 'ncclsum'}."); // AddAttr>("gpus", "gpu id lists"); AddComment(R"DOC( AllReduce the input tensors. diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index ee19a69afc..00a115feeb 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -26,7 +26,6 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput("Out"); - std::string reduction = ctx.Attr("reduction"); auto* comm = ctx.Input("Communicator"); diff --git a/paddle/operators/nccl_op.h b/paddle/operators/nccl_op.h index 09606c4acd..a438e4eaa2 100644 --- a/paddle/operators/nccl_op.h +++ b/paddle/operators/nccl_op.h @@ -40,9 +40,9 @@ template class NCCLInitKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* gpus = ctx.Input>("gpus"); + std::vector gpus = ctx.Attr>("gpus"); auto* comm = ctx.Output("Communicator"); - comm->InitAll(*gpus); + comm->InitAll(gpus); } }; diff --git a/paddle/operators/nccl_op_test.cc b/paddle/operators/nccl_op_test.cc new file mode 100644 index 0000000000..9c319a3387 --- /dev/null +++ b/paddle/operators/nccl_op_test.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ +#include "paddle/operators/nccl_op.h" + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/platform/device_context.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/gpu_info.h" + +#include +#include +#include + +static std::vector gpu_list; + +using f = paddle::framework; +using ops = paddle::operators; + +void AddOp(const std::string &type, const f::VariableNameMap &inputs, + const f::VariableNameMap &outputs, f::AttributeMap attrs, + paddle::framework::BlockDescBind *block) { + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->Var(v); + var->SetDataType(paddle::framework::DataType::FP32); + } + } + + auto op = block->AppendOp(); + op->SetType(type); + for (auto &kv : inputs) { + op->SetInput(kv.first, kv.second); + } + for (auto &kv : outputs) { + op->SetOutput(kv.first, kv.second); + } + op->SetAttrMap(attrs); +} + +TEST(NCCL, ncclInitOp) { + f::ProgramDescBind program; + f::BlockDescBind *block = program.Block(0); +} + +int main(int argc, char **argv) { + static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < gpu_count; ++i) { + gpu_list.emplace_back(i); + } + if (dev_count <= 1) { + LOG(WARNING) + << "Cannot test multi-gpu nccl, because the CUDA device count is " + << dev_count; + return 0; + } + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu new file mode 100644 index 0000000000..9c319a3387 --- /dev/null +++ b/paddle/operators/nccl_op_test.cu @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ +#include "paddle/operators/nccl_op.h" + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/platform/device_context.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/gpu_info.h" + +#include +#include +#include + +static std::vector gpu_list; + +using f = paddle::framework; +using ops = paddle::operators; + +void AddOp(const std::string &type, const f::VariableNameMap &inputs, + const f::VariableNameMap &outputs, f::AttributeMap attrs, + paddle::framework::BlockDescBind *block) { + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->Var(v); + var->SetDataType(paddle::framework::DataType::FP32); + } + } + + auto op = block->AppendOp(); + op->SetType(type); + for (auto &kv : inputs) { + op->SetInput(kv.first, kv.second); + } + for (auto &kv : outputs) { + op->SetOutput(kv.first, kv.second); + } + op->SetAttrMap(attrs); +} + +TEST(NCCL, ncclInitOp) { + f::ProgramDescBind program; + f::BlockDescBind *block = program.Block(0); +} + +int main(int argc, char **argv) { + static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < gpu_count; ++i) { + gpu_list.emplace_back(i); + } + if (dev_count <= 1) { + LOG(WARNING) + << "Cannot test multi-gpu nccl, because the CUDA device count is " + << dev_count; + return 0; + } + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index b6e44fdbad..e1e382b2bb 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/framework/tensor_array.h" #include "paddle/operators/cond_op.h" #include "paddle/operators/dynamic_recurrent_op.h" +#include "paddle/operators/nccl/nccl_gpu_common.h" #include "paddle/operators/net_op.h" #include "paddle/operators/recurrent_op.h" #include "paddle/platform/enforce.h" @@ -203,6 +204,13 @@ All parameter, weight, gradient are variables in Paddle. return self.GetMutable(); }, py::return_value_policy::reference) +#ifdef PADDLE_WITH_CUDA + .def("get_communicator", + [](Variable &self) -> platform::Communicator * { + return self.GetMutable(); + }, + py::return_value_policy::reference) +#endif .def("get_net", [](Variable &self) -> operators::NetOp * { return self.GetMutable(); @@ -258,8 +266,11 @@ All parameter, weight, gradient are variables in Paddle. 
return new paddle::platform::CUDADeviceContext(place); #endif }); - // clang-format on +// clang-format on +#ifdef PADDLE_WITH_CUDA + py::class_(m, "Communicator").def(py::init<>()); +#endif py::class_(m, "GPUPlace") .def(py::init()) .def("__str__", string::to_string); diff --git a/python/paddle/v2/framework/tests/test_multigpu.py b/python/paddle/v2/framework/tests/test_multigpu.py new file mode 100644 index 0000000000..b75d274d88 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_multigpu.py @@ -0,0 +1,8 @@ +import unittest, os +import numpy as np +import paddle.v2 as paddle +from paddle.v2.framework.op import Operator +import paddle.v2.framework.core as core +from op_test import OpTest, create_op, set_input + +gpu_list = "0,1,2,3" diff --git a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py index 0e6927a24d..06e079eda8 100644 --- a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py +++ b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py @@ -1,4 +1,5 @@ import unittest, os +from threading import Thread import numpy as np import paddle.v2 as paddle from paddle.v2.framework.op import Operator @@ -13,94 +14,87 @@ if not core.is_compile_gpu() or not gpu_list: g_scope = core.Scope() g_ctx = core.DeviceContext.create(core.CPUPlace()) +gpus = [int(g) for g in gpu_list.split(",")] -class TestNCCLInit(OpTest): - def setUp(self): - self.op_type = "ncclInit" - self.gpus = [int(g) for g in gpu_list.split(",")] - - self.attrs = {"gpus": self.gpus} - self.scope = g_scope.var("Communicator") - self.outputs = {"Communicator": self.scope.var("Communicator")} +# ground truth +def allreduce(tensors, gpus): + num_device = len(gpus) + assert (len(tensors) == num_device), "not match of tensor and device" + Out = tensors + for i in range(1, len(tensors)): + Out[0] += Out[i] - def test_check_output(self): - self.check_output() + for i in range(1, len(tensors)): + Out[i] = Out[0] + return Out -class TestNCCLAllReduce(unittest.TestCase): - def setUp(self): - # cpu allreduce for check - def allreduce(tensors, gpus): - num_device = len(gpus) - assert ( - len(tensors) == num_device), "not match of tensor and device" - Out = tensors - for i in range(1, len(tensors)): - Out[0] += Out[i] - for i in range(1, len(tensors)): - Out[i] = Out[0] - - return Out - - self.op_type = "ncclAllReduce" +input_data = [ + np.random.random((32, 32)).astype("float32") for i in range(len(gpus)) +] +output_data = allreduce(input_data, gpus) - self.gpus = [int(g) for g in gpu_list.split(",")] +# output_vars = [g_scope.var("Out_"+str(i)).get_tensor() +# for i in range(len(gpus))] - self.g_scope = core.Scope() - self.g_ctx = core.DeviceContext.create(core.CPUPlace()) - self.scopes = [] - self.ops = [] - self.places = [] - self.input_data = [] +def thread_allreduce_op(thread_id, gpu_id): + i = gpu_id + scope = g_scope.new_scope() + place = core.GPUPlace(gpus[i]) + inputs = { + "X": input_data[i], + "Communicator": scope.find_var("Communicator") + } + outputs = {"Out": output_data[i]} - for i in range(len(self.gpus)): - self.input_data.append(np.random.random((32, 32))) - self.output_data = allreduce(self.input_data, self.gpus) + op = create_op(scope, "ncclAllReduce", inputs, outputs, attrs={}) + place = core.GPUPlace(gpus[i]) + set_input(scope, op, inputs, place) - nccl_init = Operator("ncclInit", Out="Communicator", gpus=self.gpus) - nccl_init.run(self.g_scope, self.g_ctx) + ctx = core.DeviceContext.create(place) - for i in range(len(self.gpus)): - 
# insert kid scope - scope = self.g_scope.new_scope() - place = core.GPUPlace(self.gpus[i]) + print "thread_id : ", thread_id, "gpu_id : ", gpu_id, " invoke allreduce" + op.run(scope, ctx) + print "thread_id : ", thread_id, "gpu_id : ", gpu_id, " allreduce Done." - inputs = { - "X": self.input_data[i], - "Communicator": scope.find_var("Communicator") - } - outputs = {"Out": self.output_data[i]} - # attrs = {"gpus": self.gpus} - op = create_op(scope, self.op_type, inputs, outputs, attrs) - set_input(scope, op, inputs, place) +class TestNCCLAllReduce(unittest.TestCase): + def setUp(self): + self.op_type = "ncclAllReduce" - self.scopes.append(scope) - self.ops.append(op) - self.places.append(place) + nccl_init = create_op( + g_scope, + op_type="ncclInit", + inputs={}, + outputs={ + "Communicator": g_scope.var("Communicator").get_communicator() + }, + attrs={"gpus": gpus}) + nccl_init.run(g_scope, g_ctx) def test_output(self): - idx = 0 - for scope, place, op in zip(self.scopes, self.places, self.ops): - ctx = core.DeviceContext.create(place) - op.run(scope, ctx) + ops = [] + for i in range(len(gpus)): + th = Thread( + target=thread_allreduce_op, args=( + i, + gpus[i], )) + th.start() + ops.append(ops) + for th in ops: + th.join() + idx = 0 for out_name, out_dup in Operator.get_op_outputs(self.op.type()): actual = np.array(scope.find_var(out_name).get_tensor()) - expect = self.output_data[idx] + expect = output_data[idx] idx += 1 self.assertTrue(actual, expect), "has diff" -# if __name__ == "__main__": -# unittest.main() -# usage : export NV_LIST=0,1,2,3 python *.py - -# os.environ["NV_LIST"] = ["0,1,2,3"] - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/framework/tests/test_nccl_init_op.py b/python/paddle/v2/framework/tests/test_nccl_init_op.py new file mode 100644 index 0000000000..8aed14c15d --- /dev/null +++ b/python/paddle/v2/framework/tests/test_nccl_init_op.py @@ -0,0 +1,36 @@ +import unittest, os +import numpy as np +import paddle.v2 as paddle +from paddle.v2.framework.op import Operator +import paddle.v2.framework.core as core +from op_test import OpTest, create_op, set_input + +gpu_list = "0,1,2,3" + +if not core.is_compile_gpu() or not gpu_list: + exit(0) + +g_scope = core.Scope() +g_ctx = core.DeviceContext.create(core.CPUPlace()) + + +class TestNCCLInit(unittest.TestCase): + def test_init(self): + self.op_type = "ncclInit" + self.gpus = [int(g) for g in gpu_list.split(",")] + + self.inputs = {} + self.attrs = {"gpus": self.gpus} + g_scope.var("Communicator").get_communicator() + self.outputs = {"Communicator": g_scope.find_var("Communicator")} + nccl_init = create_op( + g_scope, + op_type=self.op_type, + inputs=self.inputs, + outputs=self.outputs, + attrs=self.attrs) + nccl_init.run(g_scope, g_ctx) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_nccl_reduce_op.py b/python/paddle/v2/framework/tests/test_nccl_reduce_op.py index 675ad5766c..0cee1923a6 100644 --- a/python/paddle/v2/framework/tests/test_nccl_reduce_op.py +++ b/python/paddle/v2/framework/tests/test_nccl_reduce_op.py @@ -4,3 +4,22 @@ import paddle.v2 as paddle from paddle.v2.framework.op import Operator import paddle.v2.framework.core as core from op_test import OpTest, create_op, set_input + +gpu_list = "0,1,2,3" +g_scope = core.Scope() +g_ctx = core.DeviceContext.create(core.CPUPlace()) + +if not core.is_compile_gpu() or not gpu_list: + exit(0) + + +class TestNCCLReduce(OpTest): + def setUp(self): + self.op_type = "ncclReduce" + self.gpus = 
[int(g) for g in gpu_list.split(",")] + + self.scope = g_scope.var("Communicator").get_communicator() + self.outputs = {"Communicator": self.scope.var("Communicator")} + + def test_check_output(self): + self.check_output() From 0990c87bf63302ab608005ec7aa2e8dcd37b6b5c Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Tue, 24 Oct 2017 13:43:01 -0700 Subject: [PATCH 26/81] checkin nccl operator --- paddle/operators/nccl/nccl_gpu_common.h | 3 +- paddle/operators/nccl_op_test.cc | 71 ------------------- paddle/operators/nccl_op_test.cu | 37 ++++++++-- paddle/platform/nccl_test.cu | 7 +- .../framework/tests/test_nccl_allreduce_op.py | 13 ++-- 5 files changed, 42 insertions(+), 89 deletions(-) delete mode 100644 paddle/operators/nccl_op_test.cc diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h index f492f96aa8..fe49d19a9d 100644 --- a/paddle/operators/nccl/nccl_gpu_common.h +++ b/paddle/operators/nccl/nccl_gpu_common.h @@ -46,7 +46,8 @@ struct Communicator { ~Communicator() { for (size_t i = 0; i < comms_.size(); ++i) { - PADDLE_ENFORCE(dynload::ncclCommDestroy(comms_[i])); + // FIXME(dzh) : PADDLE_ENFORCE return void + dynload::ncclCommDestroy(comms_[i]); } } diff --git a/paddle/operators/nccl_op_test.cc b/paddle/operators/nccl_op_test.cc deleted file mode 100644 index 9c319a3387..0000000000 --- a/paddle/operators/nccl_op_test.cc +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -#include "paddle/operators/nccl_op.h" - -#include "glog/logging.h" -#include "gtest/gtest.h" - -#include "paddle/platform/device_context.h" -#include "paddle/platform/enforce.h" -#include "paddle/platform/gpu_info.h" - -#include -#include -#include - -static std::vector gpu_list; - -using f = paddle::framework; -using ops = paddle::operators; - -void AddOp(const std::string &type, const f::VariableNameMap &inputs, - const f::VariableNameMap &outputs, f::AttributeMap attrs, - paddle::framework::BlockDescBind *block) { - for (auto kv : outputs) { - for (auto v : kv.second) { - auto var = block->Var(v); - var->SetDataType(paddle::framework::DataType::FP32); - } - } - - auto op = block->AppendOp(); - op->SetType(type); - for (auto &kv : inputs) { - op->SetInput(kv.first, kv.second); - } - for (auto &kv : outputs) { - op->SetOutput(kv.first, kv.second); - } - op->SetAttrMap(attrs); -} - -TEST(NCCL, ncclInitOp) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.Block(0); -} - -int main(int argc, char **argv) { - static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount(); - for (int i = 0; i < gpu_count; ++i) { - gpu_list.emplace_back(i); - } - if (dev_count <= 1) { - LOG(WARNING) - << "Cannot test multi-gpu nccl, because the CUDA device count is " - << dev_count; - return 0; - } - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index 9c319a3387..15d8bde933 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -16,6 +16,11 @@ #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/framework/block_desc.h" +#include "paddle/framework/op_desc.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/program_desc.h" +#include "paddle/framework/var_desc.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" #include "paddle/platform/gpu_info.h" @@ -26,8 +31,8 @@ static std::vector gpu_list; -using f = paddle::framework; -using ops = paddle::operators; +namespace f = paddle::framework; +namespace ops = paddle::operators; void AddOp(const std::string &type, const f::VariableNameMap &inputs, const f::VariableNameMap &outputs, f::AttributeMap attrs, @@ -50,22 +55,40 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, op->SetAttrMap(attrs); } -TEST(NCCL, ncclInitOp) { +TEST(NCCL, ncclInit) { f::ProgramDescBind program; f::BlockDescBind *block = program.Block(0); + f::OpDescBind *op = block->AppendOp(); + + paddle::platform::Communicator comm; + op->SetType("ncclInit"); + op->SetOutput("Communicator", ) + + AddOp("ncclInit", {}, {{"Communicator", {comm}}}, {{"gpus", {gpu_list}}}, + block); } +// TEST(NCCL, ncclAllReduce) { +// f::ProgramDescBind program; +// f::BlockDescBind *block = program.Block(0); + +// paddle::platform::Communicator comm; +// AddOp("ncclInit", {}, {{"Communicator", {comm}}, {"gpus", {gpu_list}}}, +// block); +// } + int main(int argc, char **argv) { - static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount(); - for (int i = 0; i < gpu_count; ++i) { - gpu_list.emplace_back(i); - } + static int dev_count = paddle::platform::GetCUDADeviceCount(); if (dev_count <= 1) { LOG(WARNING) << "Cannot test multi-gpu nccl, because the CUDA device count is " << dev_count; return 0; } + + for (int i = 0; i < dev_count; ++i) { + gpu_list.emplace_back(i); + } testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git 
a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu index ab8b96f726..c99dae68be 100644 --- a/paddle/platform/nccl_test.cu +++ b/paddle/platform/nccl_test.cu @@ -31,9 +31,7 @@ namespace platform { TEST(NCCL, init) { std::vector comms; comms.resize(dev_count); - - auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); - PADDLE_ENFORCE(status); + PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); for (int i = 0; i < dev_count; ++i) { dynload::ncclCommDestroy(comms[i]); } @@ -64,8 +62,7 @@ TEST(NCCL, all_reduce) { std::vector comms; comms.resize(dev_count); VLOG(1) << "Initializing ncclComm"; - auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); - PADDLE_ENFORCE(status); + PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); VLOG(1) << "ncclComm initialized"; VLOG(1) << "Creating thread data"; std::vector>> data; diff --git a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py index 06e079eda8..f79dcd664b 100644 --- a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py +++ b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py @@ -53,6 +53,9 @@ def thread_allreduce_op(thread_id, gpu_id): op = create_op(scope, "ncclAllReduce", inputs, outputs, attrs={}) place = core.GPUPlace(gpus[i]) set_input(scope, op, inputs, place) + # # print scope.find_var("Out").get_tensor() + # # print scope.find_var("X").get_tensor() + print scope.find_var("Communicator").get_communicator() ctx = core.DeviceContext.create(place) @@ -83,13 +86,13 @@ class TestNCCLAllReduce(unittest.TestCase): i, gpus[i], )) th.start() - ops.append(ops) - for th in ops: - th.join() + ops.append(th) + for t in ops: + t.join() idx = 0 - for out_name, out_dup in Operator.get_op_outputs(self.op.type()): - actual = np.array(scope.find_var(out_name).get_tensor()) + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + actual = np.array(g_scope.find_var(out_name).get_tensor()) expect = output_data[idx] idx += 1 From 1e8474b9f1290b7d70bd07b497f9d5e9299ef47d Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Tue, 24 Oct 2017 14:25:46 -0700 Subject: [PATCH 27/81] "delete python ops testcase" --- paddle/operators/nccl_op_test.cu | 52 ++++++----- .../v2/framework/tests/test_multigpu.py | 8 -- .../v2/framework/tests/test_nccl_ops.py | 87 ------------------- 3 files changed, 29 insertions(+), 118 deletions(-) delete mode 100644 python/paddle/v2/framework/tests/test_multigpu.py delete mode 100644 python/paddle/v2/framework/tests/test_nccl_ops.py diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index 15d8bde933..a25e01baa4 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -13,8 +13,11 @@ limitations under the License. 
*/ #include "paddle/operators/nccl_op.h" -#include "glog/logging.h" -#include "gtest/gtest.h" +#include +#include +#include +#include +#include #include "paddle/framework/block_desc.h" #include "paddle/framework/op_desc.h" @@ -24,10 +27,13 @@ #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" #include "paddle/platform/gpu_info.h" +#include "paddle/platform/place.h" -#include -#include -#include +USE_CPU_ONLY_OP(ncclInit); +USE_GPU_ONLY_OP(ncclAllReduce); +USE_GPU_ONLY_OP(ncclReduce); +USE_GPU_ONLY_OP(ncclBcastSend); +USE_GPU_ONLY_OP(ncclBcastRecv); static std::vector gpu_list; @@ -55,28 +61,28 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, op->SetAttrMap(attrs); } -TEST(NCCL, ncclInit) { +// ncclInitOp with desc +TEST(NCCL, ncclInitOp) { f::ProgramDescBind program; f::BlockDescBind *block = program.Block(0); - f::OpDescBind *op = block->AppendOp(); - - paddle::platform::Communicator comm; - op->SetType("ncclInit"); - op->SetOutput("Communicator", ) - - AddOp("ncclInit", {}, {{"Communicator", {comm}}}, {{"gpus", {gpu_list}}}, - block); + f::OpDescBind *op1 = block->AppendOp(); + + op1->SetType("ncclInit"); + op1->SetOutput("Communicator", {"x1"}); + op1->SetAttr("gpus", {gpu_list}); + f::Scope g_scope; + paddle::platform::DeviceContext *ctx = + new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); + + auto *var = g_scope.Var("x1"); + var->GetMutable(); + + auto op = f::OpRegistry::CreateOp(*op1); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, *ctx); + VLOG(1) << "NCCLInitOp finished."; } -// TEST(NCCL, ncclAllReduce) { -// f::ProgramDescBind program; -// f::BlockDescBind *block = program.Block(0); - -// paddle::platform::Communicator comm; -// AddOp("ncclInit", {}, {{"Communicator", {comm}}, {"gpus", {gpu_list}}}, -// block); -// } - int main(int argc, char **argv) { static int dev_count = paddle::platform::GetCUDADeviceCount(); if (dev_count <= 1) { diff --git a/python/paddle/v2/framework/tests/test_multigpu.py b/python/paddle/v2/framework/tests/test_multigpu.py deleted file mode 100644 index b75d274d88..0000000000 --- a/python/paddle/v2/framework/tests/test_multigpu.py +++ /dev/null @@ -1,8 +0,0 @@ -import unittest, os -import numpy as np -import paddle.v2 as paddle -from paddle.v2.framework.op import Operator -import paddle.v2.framework.core as core -from op_test import OpTest, create_op, set_input - -gpu_list = "0,1,2,3" diff --git a/python/paddle/v2/framework/tests/test_nccl_ops.py b/python/paddle/v2/framework/tests/test_nccl_ops.py deleted file mode 100644 index 6dd6231aa8..0000000000 --- a/python/paddle/v2/framework/tests/test_nccl_ops.py +++ /dev/null @@ -1,87 +0,0 @@ -import unittest, os -import numpy as np -import paddle.v2 as paddle -from paddle.v2.framework.op import Operator -import paddle.v2.framework.core as core -from op_test import OpTest, create_op, set_input - -# gpu_list = os.environ["NV_LIST"] -gpu_list = "0,1,2,3" - -if not core.is_compile_gpu() or not gpu_list: - exit(0) - - -def allreduce(tensors, gpus): - num_device = len(gpus) - assert (len(tensors) == num_device), "not match of tensor and device" - Out = tensors - for i in range(1, len(tensors)): - Out[0] += Out[i] - - for i in range(1, len(tensors)): - Out[i] = Out[0] - - return Out - - -class TestNCCLAllReduce(unittest.TestCase): - def setUp(self): - - self.op_type = "ncclAllReduce" - - self.gpus = [int(g) for g in gpu_list.split(",")] - - self.g_scope = core.Scope() - self.g_ctx = core.DeviceContext.create(core.CPUPlace()) - 
self.scopes = [] - self.ops = [] - self.places = [] - - self.input_data = [] - - for i in range(len(self.gpus)): - self.input_data.append(np.random.random((32, 32))) - self.output_data = allreduce(self.input_data, self.gpus) - - nccl_init = Operator("ncclInit", Out="Communicator", gpus=self.gpus) - op.run(self.g_scope, self.g_ctx) - - for i in range(len(self.gpus)): - # insert kid scope - scope = self.g_scope.new_scope() - place = core.GPUPlace(self.gpus[i]) - - inputs = {"X": self.input_data[i]} - outputs = {"Out": self.output_data[i]} - attrs = {"gpus": self.gpus} - - op = create_op(scope, self.op_type, inputs, outputs, attrs) - set_input(scope, op, inputs, place) - - self.scopes.append(scope) - self.ops.append(op) - self.places.append(place) - - def test_output(self): - idx = 0 - for scope, place, op in zip(self.scopes, self.places, self.ops): - ctx = core.DeviceContext.create(place) - op.run(scope, ctx) - - for out_name, out_dup in Operator.get_op_outputs(self.op.type()): - actual = np.array(scope.find_var(out_name).get_tensor()) - expect = self.output_data[idx] - - idx += 1 - self.assertTrue(actual, expect), "has diff" - - -# if __name__ == "__main__": -# unittest.main() -# usage : export NV_LIST=0,1,2,3 python *.py - -# os.environ["NV_LIST"] = ["0,1,2,3"] - -if __name__ == "__main__": - unittest.main() From 026c61c02700df2481d3e1dd7a2349844197937e Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Tue, 24 Oct 2017 14:27:56 -0700 Subject: [PATCH 28/81] "fix allreduce python test" --- python/paddle/v2/framework/tests/test_nccl_allreduce_op.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py index f79dcd664b..0a9163dd55 100644 --- a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py +++ b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py @@ -36,9 +36,6 @@ input_data = [ ] output_data = allreduce(input_data, gpus) -# output_vars = [g_scope.var("Out_"+str(i)).get_tensor() -# for i in range(len(gpus))] - def thread_allreduce_op(thread_id, gpu_id): i = gpu_id @@ -53,9 +50,6 @@ def thread_allreduce_op(thread_id, gpu_id): op = create_op(scope, "ncclAllReduce", inputs, outputs, attrs={}) place = core.GPUPlace(gpus[i]) set_input(scope, op, inputs, place) - # # print scope.find_var("Out").get_tensor() - # # print scope.find_var("X").get_tensor() - print scope.find_var("Communicator").get_communicator() ctx = core.DeviceContext.create(place) From 63fb41b39991608e6ff9da569d956f7ddccb9b50 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Tue, 24 Oct 2017 15:55:52 -0700 Subject: [PATCH 29/81] "redefine the initop from kernel to OpBase" --- paddle/framework/operator.h | 2 +- paddle/operators/nccl_op.cc | 37 ++++++++++++++++++-------------- paddle/operators/nccl_op.cu | 21 +++++++++++++++++- paddle/operators/nccl_op_test.cu | 34 +++++++++++++++++++++++------ 4 files changed, 70 insertions(+), 24 deletions(-) diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index aca663ffc6..09989c374c 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -125,7 +125,7 @@ class OperatorBase { protected: std::string type_; // NOTE: in case of OpGrad, inputs_ contains: - // I (Inputs)opear + // I (Inputs) // O (Outputs) // OG (Output Gradients) VariableNameMap inputs_; diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 6213f23613..ec7a89d5ff 100644 --- a/paddle/operators/nccl_op.cc +++ 
b/paddle/operators/nccl_op.cc @@ -9,26 +9,30 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/nccl_op.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/nccl/nccl_gpu_common.h" namespace paddle { namespace operators { // NCCLinitOp -class NCCLInitOp : public framework::OperatorWithKernel { +class NCCLInitOp : public framework::OperatorBase { public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasOutput("Communicator"), - " Output(Communicator) of ncclInitOp should not be NULL"); - } - - protected: - framework::DataType IndicateDataType( - const framework::ExecutionContext &ctx) const override { - return static_cast(ctx.Attr("data_type")); + NCCLInitOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + const auto &name = Output("Communicator"); + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name), + "Can not find variable '%s' in the scope.", name); + std::vector gpus = Attr>("gpus"); + PADDLE_ENFORCE(!gpus.empty(), "Attr(gpus) should not be empty."); + platform::Communicator *comm = + scope.FindVar(name)->GetMutable(); + comm->InitAll(gpus); } }; @@ -188,13 +192,14 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +REGISTER_OPERATOR(ncclInit, ops::NCCLInitOp, + paddle::framework::EmptyGradOpMaker, ops::NCCLInitOpMaker); + REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp, ops::NCCLAllReduceOpMaker); -REGISTER_OP_WITHOUT_GRADIENT(ncclInit, ops::NCCLInitOp, ops::NCCLInitOpMaker); REGISTER_OP_WITHOUT_GRADIENT(ncclBcastSend, ops::NCCLBcastSendOp, ops::NCCLBcastSendOpMaker); REGISTER_OP_WITHOUT_GRADIENT(ncclBcastRecv, ops::NCCLBcastRecvOp, ops::NCCLBcastRecvOpMaker); REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp, ops::NCCLReduceOpMaker); -REGISTER_OP_CPU_KERNEL(ncclInit, ops::NCCLInitKernel); diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index 00a115feeb..4fbdf1ce02 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -12,11 +12,30 @@ limitations under the License. */ #define EIGEN_USE_GPU #include -#include "paddle/operators/nccl_op.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/nccl/nccl_gpu_common.h" namespace paddle { namespace operators { +using framework::Tensor; +using platform::Communicator; + +template +class NCCLTypeWrapper; + +template <> +class NCCLTypeWrapper { + public: + static const ncclDataType_t type = ncclFloat; +}; + +template <> +class NCCLTypeWrapper { + public: + static const ncclDataType_t type = ncclDouble; +}; + template class NCCLAllReduceKernel : public framework::OpKernel { public: diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index a25e01baa4..334884d657 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -11,7 +11,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/
-#include "paddle/operators/nccl_op.h"
 #include
 #include
@@ -65,11 +64,11 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
 TEST(NCCL, ncclInitOp) {
   f::ProgramDescBind program;
   f::BlockDescBind *block = program.Block(0);
-  f::OpDescBind *op1 = block->AppendOp();
+  f::OpDescBind *op_desc = block->AppendOp();

-  op1->SetType("ncclInit");
-  op1->SetOutput("Communicator", {"x1"});
-  op1->SetAttr("gpus", {gpu_list});
+  op_desc->SetType("ncclInit");
+  op_desc->SetOutput("Communicator", {"x1"});
+  op_desc->SetAttr("gpus", {gpu_list});
   f::Scope g_scope;
   paddle::platform::DeviceContext *ctx =
       new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
@@ -77,7 +76,30 @@ TEST(NCCL, ncclInitOp) {
   auto *var = g_scope.Var("x1");
   var->GetMutable();

-  auto op = f::OpRegistry::CreateOp(*op1);
+  auto op = f::OpRegistry::CreateOp(*op_desc);
   VLOG(1) << "invoke NCCLInitOp.";
   op->Run(g_scope, *ctx);
   VLOG(1) << "NCCLInitOp finished.";
 }
+
+// ncclAllReduceOp with desc
+TEST(NCCL, ncclAllReduceOp) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op_desc = block->AppendOp();
+
+  op_desc->SetType("ncclAllReduce");
+
+  op_desc->SetOutput("Communicator", {"x1"});
+  op_desc->SetAttr("gpus", {gpu_list});
+  f::Scope g_scope;
+  paddle::platform::DeviceContext *ctx =
+      new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
+
+  auto *var = g_scope.Var("x1");
+  var->GetMutable();
+
+  auto op = f::OpRegistry::CreateOp(*op_desc);
+  VLOG(1) << "invoke ncclAllReduceOp.";
+  op->Run(g_scope, *ctx);
+  VLOG(1) << "ncclAllReduceOp finished.";
+}

From 0f67a8272896bed63efd777133a3cafb6bc572f8 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Wed, 25 Oct 2017 15:30:24 +0800
Subject: [PATCH 30/81] add test_Expand and simplify the gserver/tests/CMakeLists

---
 paddle/gserver/tests/CMakeLists.txt  | 165 ++++++++-------------------
 paddle/gserver/tests/test_Expand.cpp | 125 ++++++++++++++++++++
 2 files changed, 174 insertions(+), 116 deletions(-)
 create mode 100644 paddle/gserver/tests/test_Expand.cpp

diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 329536afaf..aa94ee406e 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -1,24 +1,29 @@
 # gserver package unittests

-if(NOT MOBILE_INFERENCE)
-################### test_ProtoDataProvider ############
-  add_unittest_without_exec(test_ProtoDataProvider
-    test_ProtoDataProvider.cpp)
-
-  # test_ProtoDataProvider will mkdir as same name,
-  # so if WORKING_DIRECTORY is default directory, then
-  # mkdir will get error.
- add_test(NAME test_ProtoDataProvider - COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) -endif() +add_simple_unittest(test_LinearChainCRF) +add_simple_unittest(test_MultinomialSampler) +add_simple_unittest(test_RecurrentLayer) -################# test_LayerGrad ####################### -add_unittest_without_exec(test_LayerGrad - test_LayerGrad.cpp - LayerGradUtil.cpp) -add_test(NAME test_LayerGrad - COMMAND test_LayerGrad) +function(gserver_test TARGET) + add_unittest_without_exec(${TARGET} + ${TARGET}.cpp + LayerGradUtil.cpp) + add_test(NAME ${TARGET} + COMMAND ${TARGET}) +endfunction() + +gserver_test(test_LayerGrad) +gserver_test(test_CRFLayerGrad) +gserver_test(test_CrossEntropyOverBeamGrad) +gserver_test(test_SeqSliceLayerGrad) +gserver_test(test_ActivationGrad) +gserver_test(test_ConvTrans) +gserver_test(test_PriorBox) +gserver_test(test_DetectionOutput) +gserver_test(test_ConvUnify) +gserver_test(test_BatchNorm) +gserver_test(test_KmaxSeqScore) +gserver_test(test_Expand) ########## test_Mkldnn layers and activations ########## if(WITH_MKLDNN) @@ -32,89 +37,6 @@ if(WITH_MKLDNN) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() -################ test_CRFLayerGrad #################### -add_unittest_without_exec(test_CRFLayerGrad - test_CRFLayerGrad.cpp - LayerGradUtil.cpp) -add_test(NAME test_CRFLayerGrad - COMMAND test_CRFLayerGrad) - -################ test_CrossEntropyOverBeam #################### -add_unittest_without_exec(test_CrossEntropyOverBeam - test_CrossEntropyOverBeamGrad.cpp - LayerGradUtil.cpp) -add_test(NAME test_CrossEntropyOverBeam - COMMAND test_CrossEntropyOverBeam) - -################ test_SeqSliceLayerGrad #################### -add_unittest_without_exec(test_SeqSliceLayerGrad - test_SeqSliceLayerGrad.cpp - LayerGradUtil.cpp) -add_test(NAME test_SeqSliceLayerGrad - COMMAND test_SeqSliceLayerGrad) - -add_unittest_without_exec(test_ActivationGrad - test_ActivationGrad.cpp - LayerGradUtil.cpp) -add_test(NAME test_ActivationGrad - COMMAND test_ActivationGrad) -################# test_ConvTrans ####################### -add_unittest_without_exec(test_ConvTrans - test_ConvTrans.cpp - LayerGradUtil.cpp) - -add_test(NAME test_ConvTrans - COMMAND test_ConvTrans) -################# test_PriorBox ####################### -add_unittest_without_exec(test_PriorBox - test_PriorBox.cpp - LayerGradUtil.cpp) - -add_test(NAME test_PriorBox - COMMAND test_PriorBox) -################# test_DetectionOutput ####################### -add_unittest_without_exec(test_DetectionOutput - test_DetectionOutput.cpp - LayerGradUtil.cpp) - -add_test(NAME test_DetectionOutput - COMMAND test_DetectionOutput) -################# test_ConvUnify ####################### -add_unittest_without_exec(test_ConvUnify - test_ConvUnify.cpp - LayerGradUtil.cpp) - -add_test(NAME test_ConvUnify - COMMAND test_ConvUnify) -################# test_BatchNorm ####################### -add_unittest_without_exec(test_BatchNorm - test_BatchNorm.cpp - LayerGradUtil.cpp) - -add_test(NAME test_BatchNorm - COMMAND test_BatchNorm) - - -################# test_KmaxSeqScore ####################### -add_unittest_without_exec(test_KmaxSeqScore - test_KmaxSeqScore.cpp - LayerGradUtil.cpp) - -add_test(NAME test_KmaxSeqScore - COMMAND test_KmaxSeqScore) - -if(NOT MOBILE_INFERENCE) -################## test_Evaluator ####################### - add_unittest(test_Evaluator - test_Evaluator.cpp) -endif() - -################ test_LinearChainCRF #################### 
-add_simple_unittest(test_LinearChainCRF) - -############## test_MultinomialSampler ################### -add_simple_unittest(test_MultinomialSampler) - ############## test_PyDataProvider ######################## if(WITH_PYTHON) add_unittest_without_exec(test_PyDataProvider @@ -125,9 +47,6 @@ if(WITH_PYTHON) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() -############### test_RecurrentLayer ####################### -add_simple_unittest(test_RecurrentLayer) - ############### test_WarpCTCLayer ####################### if(NOT WITH_DOUBLE) add_unittest_without_exec(test_WarpCTCLayer @@ -139,19 +58,33 @@ if(NOT WITH_DOUBLE) endif() if(NOT MOBILE_INFERENCE) -############### test_RecurrentGradientMachine ############### - # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine - # I will fix it. - add_unittest_without_exec(test_RecurrentGradientMachine - test_RecurrentGradientMachine.cpp) - add_test(NAME test_RecurrentGradientMachine - COMMAND .set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) -endif() +################### test_ProtoDataProvider ############ + add_unittest_without_exec(test_ProtoDataProvider + test_ProtoDataProvider.cpp) -if(NOT MOBILE_INFERENCE) + # test_ProtoDataProvider will mkdir as same name, + # so if WORKING_DIRECTORY is default directory, then + # mkdir will get error. + add_test(NAME test_ProtoDataProvider + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) + +################## test_Evaluator ####################### + add_unittest(test_Evaluator + test_Evaluator.cpp) + +############### test_RecurrentGradientMachine ############### + # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine + # I will fix it. + add_unittest_without_exec(test_RecurrentGradientMachine + test_RecurrentGradientMachine.cpp) + add_test(NAME test_RecurrentGradientMachine + COMMAND .set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests + ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) + +############### test_NetworkCompare ############### add_unittest_without_exec(test_NetworkCompare test_NetworkCompare.cpp) if(WITH_GPU) diff --git a/paddle/gserver/tests/test_Expand.cpp b/paddle/gserver/tests/test_Expand.cpp new file mode 100644 index 0000000000..a84a518a01 --- /dev/null +++ b/paddle/gserver/tests/test_Expand.cpp @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +// Do one forward pass of expand layer and check to see if its output +// matches the given result.(Test onlyCPU currently.) 
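// The helper below wires an "expand" layer to two data layers: input1
// ("layer0") carries the values to be expanded, while input2 ("layer1")
// only supplies the target (sub-)sequence layout; trans_type selects whether
// input1 is treated as non-sequence or sequence data. A single forward pass
// is run and the output matrix is checked against the expected result.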
+void doOneExpandTest(string trans_type, + bool hasSubseq, + bool useGpu, + Argument& input1, + Argument& input2, + Argument& result) { + FLAGS_use_gpu = false; + // Setting up the expand layer + TestConfig config; + config.layerConfig.set_type("expand"); + + auto inputType1 = + trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA; + config.inputDefs.push_back({inputType1, "layer0", 1, 0}); + auto inputType2 = + hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA; + + config.inputDefs.push_back({inputType2, "layer1", 1, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.set_trans_type(trans_type); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + config, &dataLayers, &datas, &layerMap, "expand", 1, false, useGpu); + dataLayers[0]->getOutput() = input1; + dataLayers[1]->getOutput() = input2; + + // test layer initialize + std::vector parameters; + LayerPtr expandLayer; + initTestLayer(config, &layerMap, ¶meters, &expandLayer); + expandLayer->forward(PASS_GC); + checkMatrixEqual(expandLayer->getOutputValue(), result.value); +} + +TEST(Layer, ExpandLayerFwd) { + bool useGpu = false; + + // Assume batch_size =3 in all cases. + + // CPU case 1. non-seq expand to seq + // input1 = 1,2,3 + // input2 = [4,5],[6],[7,8,9] + // result = [1,1],[2],[3,3,3] + Argument input1, input2, result; + input1.value = Matrix::create(3, 1, false, useGpu); + real input1Data[] = {1, 2, 3}; + input1.value->setData(input1Data); + + input2.value = Matrix::create(6, 1, false, useGpu); + real input2Data[] = {4, 5, 6, 7, 8, 9}; + input2.value->setData(input2Data); + input2.sequenceStartPositions = ICpuGpuVector::create(4, useGpu); + int input2Seq[] = {0, 2, 3, 6}; + input2.sequenceStartPositions->copyFrom(input2Seq, 4, useGpu); + + result.value = Matrix::create(6, 1, false, useGpu); + real resultData[] = {1, 1, 2, 3, 3, 3}; + result.value->setData(resultData); + + doOneExpandTest("non-seq", false, useGpu, input1, input2, result); + + // CPU case 2. non-seq expand to sub-seq + // input1 = 1,2,3 + // input2 = [[4,5]],[[6]],[[7],[8,9]] + // result = [[1,1]],[[2]],[[3],[3,3]] + input2.subSequenceStartPositions = ICpuGpuVector::create(5, useGpu); + int input2SubSeq[] = {0, 2, 3, 4, 6}; + input2.subSequenceStartPositions->copyFrom(input2SubSeq, 5, useGpu); + + doOneExpandTest("non-seq", true, useGpu, input1, input2, result); + + // CPU case 3. 
seq expand to sub-seq + // input1 = [1,2],[3],[4] + // input2 = [[4,5]],[[6]],[[7],[8,9]] + // result = [[1,1]],[[2]],[[3],[4,4]] + Matrix::resizeOrCreate(input1.value, 4, 1, false, useGpu); + real input1Data_case3[] = {1, 2, 3, 4}; + input1.value->setData(input1Data_case3); + + input1.sequenceStartPositions = ICpuGpuVector::create(4, useGpu); + int input1Seq[] = {0, 2, 3, 4}; + input1.sequenceStartPositions->copyFrom(input1Seq, 4, useGpu); + + real resultData_case3[] = {1, 1, 2, 3, 4, 4}; + result.value->setData(resultData_case3); + + doOneExpandTest("seq", true, useGpu, input1, input2, result); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} From 5200c657a7899bde418afecf90f0536c1702e089 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Wed, 25 Oct 2017 09:05:03 -0700 Subject: [PATCH 31/81] "move Tensor to LoDTensor" --- paddle/operators/nccl_op.cc | 7 + paddle/operators/nccl_op.cu | 20 ++- paddle/operators/nccl_op.h | 50 -------- paddle/operators/nccl_op_test.cu | 214 +++++++++++++++++++++++-------- 4 files changed, 186 insertions(+), 105 deletions(-) delete mode 100644 paddle/operators/nccl_op.h diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index ec7a89d5ff..85f589f4aa 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -74,8 +74,15 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { // reduction == "ncclMin" || reduction == "ncclMax"), // "invalid reduction."); + // auto in_dim = x_dims[0]; ctx->SetOutputsDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); + size_t N = x_dims.size(); + auto out_dims = ctx->GetOutputsDim("Out"); + for (size_t i = 0; i < N; ++i) { + VLOG(1) << " inference (X) " << framework::product(x_dims[i]) << " (Out)" + << framework::product(out_dims[i]); + } } }; diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index 4fbdf1ce02..c507d325f2 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -12,6 +12,7 @@ limitations under the License. */ #define EIGEN_USE_GPU #include +#include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/nccl/nccl_gpu_common.h" @@ -20,6 +21,7 @@ namespace operators { using framework::Tensor; using platform::Communicator; +using framework::LoDTensor; template class NCCLTypeWrapper; @@ -43,8 +45,8 @@ class NCCLAllReduceKernel : public framework::OpKernel { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - auto ins = ctx.MultiInput("X"); - auto outs = ctx.MultiOutput("Out"); + auto ins = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); auto* comm = ctx.Input("Communicator"); @@ -56,12 +58,24 @@ class NCCLAllReduceKernel : public framework::OpKernel { boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(device_id); + size_t N = ins.size(); + for (size_t i = 0; i < N; ++i) { + VLOG(1) << " inference (X) " << framework::product(ins[i]->dims()) + << " (Out)" << framework::product(outs[i]->dims()); + } + for (size_t i = 0; i < ins.size(); ++i) { + VLOG(1) << " invoke allreduce. 
send " << ins[i]->numel() << " recv " + << outs[i]->numel(); + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( ins[i]->data(), outs[i]->mutable_data(ctx.GetPlace()), - outs[i]->numel() * sizeof(T), NCCLTypeWrapper::type, ncclSum, + outs[i]->numel(), NCCLTypeWrapper::type, ncclSum, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << " finished allreduce. send " << ins[i]->numel() << " recv " + << outs[i]->numel(); } } }; diff --git a/paddle/operators/nccl_op.h b/paddle/operators/nccl_op.h deleted file mode 100644 index a438e4eaa2..0000000000 --- a/paddle/operators/nccl_op.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include "paddle/framework/op_registry.h" -#include "paddle/operators/nccl/nccl_gpu_common.h" - -#include - -namespace paddle { -namespace operators { - -using framework::Tensor; -using platform::Communicator; - -template -class NCCLTypeWrapper; - -template <> -class NCCLTypeWrapper { - public: - static const ncclDataType_t type = ncclFloat; -}; - -template <> -class NCCLTypeWrapper { - public: - static const ncclDataType_t type = ncclDouble; -}; - -template -class NCCLInitKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - std::vector gpus = ctx.Attr>("gpus"); - auto* comm = ctx.Output("Communicator"); - comm->InitAll(gpus); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index 334884d657..0509e6ddab 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -12,101 +12,211 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +#define EIGEN_USE_GPU + #include #include #include -#include +#include +#include +#include #include #include "paddle/framework/block_desc.h" #include "paddle/framework/op_desc.h" -#include "paddle/framework/op_registry.h" #include "paddle/framework/program_desc.h" #include "paddle/framework/var_desc.h" +#include "paddle/operators/nccl/nccl_gpu_common.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" #include "paddle/platform/gpu_info.h" #include "paddle/platform/place.h" -USE_CPU_ONLY_OP(ncclInit); +#include "paddle/framework/op_registry.h" + +USE_NO_KERNEL_OP(ncclInit); USE_GPU_ONLY_OP(ncclAllReduce); USE_GPU_ONLY_OP(ncclReduce); USE_GPU_ONLY_OP(ncclBcastSend); USE_GPU_ONLY_OP(ncclBcastRecv); +namespace f = paddle::framework; +namespace p = paddle::platform; + static std::vector gpu_list; -namespace f = paddle::framework; -namespace ops = paddle::operators; - -void AddOp(const std::string &type, const f::VariableNameMap &inputs, - const f::VariableNameMap &outputs, f::AttributeMap attrs, - paddle::framework::BlockDescBind *block) { - for (auto kv : outputs) { - for (auto v : kv.second) { - auto var = block->Var(v); - var->SetDataType(paddle::framework::DataType::FP32); - } +// ncclInitOp with desc +// TEST(NCCL, ncclInitOp) { +// f::ProgramDescBind program; +// f::BlockDescBind *block = program.Block(0); +// f::OpDescBind *op_desc = block->AppendOp(); + +// op_desc->SetType("ncclInit"); +// op_desc->SetOutput("Communicator", {"x1"}); +// op_desc->SetAttr("gpus", {gpu_list}); +// f::Scope g_scope; +// p::DeviceContext *ctx = +// new p::CPUDeviceContext(p::CPUPlace()); + +// auto *var = g_scope.Var("x1"); +// var->GetMutable(); + +// auto op = f::OpRegistry::CreateOp(*op_desc); +// VLOG(1) << "invoke NCCLInitOp."; +// op->Run(g_scope, *ctx); +// VLOG(1) << "NCCLInitOp finished."; +// } + +// test data amount +static const f::DDim kDims = {100, 100}; +static std::vector dev_ctxs; + +void CreateContext() { + for (size_t i = 0; i < gpu_list.size(); ++i) { + p::GPUPlace place(i); + VLOG(1) << "create devicecontext : " << i; + dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); } +} - auto op = block->AppendOp(); - op->SetType(type); - for (auto &kv : inputs) { - op->SetInput(kv.first, kv.second); - } - for (auto &kv : outputs) { - op->SetOutput(kv.first, kv.second); +void DestroyContext() { + for (size_t i = 0; i < gpu_list.size(); ++i) { + delete dev_ctxs[i]; } - op->SetAttrMap(attrs); } -// ncclInitOp with desc -TEST(NCCL, ncclInitOp) { +// global scope +static f::Scope g_scope; +std::mutex mu; + +template +void DeviceProgram(int gpu_id, const f::OpDescBind &op_desc, f::Scope *scope) { + std::unique_lock lk(mu); f::ProgramDescBind program; f::BlockDescBind *block = program.Block(0); - f::OpDescBind *op_desc = block->AppendOp(); - - op_desc->SetType("ncclInit"); - op_desc->SetOutput("Communicator", {"x1"}); - op_desc->SetAttr("gpus", {gpu_list}); - f::Scope g_scope; - paddle::platform::DeviceContext *ctx = - new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); - - auto *var = g_scope.Var("x1"); - var->GetMutable(); - - auto op = f::OpRegistry::CreateOp(*op_desc); - VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, *ctx); - VLOG(1) << "NCCLInitOp finished."; + f::OpDescBind *op1 = block->AppendOp(); + *op1 = op_desc; + + p::GPUPlace place(gpu_id); + // p::DeviceContext *ctx = + // new p::CUDADeviceContext(place); + p::DeviceContext *ctx = dev_ctxs.at(gpu_id); + VLOG(1) << "device context : " << dev_ctxs.size() << " gpu_id " << gpu_id; 
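+
+  // DeviceProgram is the per-rank body: the test runs it on one thread per
+  // GPU because an NCCL collective is a rendezvous -- every rank in the
+  // communicator clique must enter the call before any rank can return, so
+  // driving the ranks sequentially from a single thread would deadlock.
+  // A minimal sketch of the launch pattern (workers/RunRank are illustrative
+  // names only, not helpers defined in this file):
+  //
+  //   std::vector<std::thread> workers;
+  //   for (int dev : gpu_list)
+  //     workers.emplace_back([&, dev] { RunRank(dev); });  // enter collective
+  //   for (auto &w : workers) w.join();  // returns once every rank took part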
+
+
+  // f::Scope &local_scope = g_scope.NewScope();
+
+  auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
+  auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();
+  send_tensor->Resize(kDims);
+  send_tensor->mutable_data<T>(kDims, place);
+  // recv_tensor->mutable_data<T>(kDims, place);
+
+  std::vector<T> send_vector(f::product(kDims), gpu_id);
+  send_tensor->CopyFromVector(send_vector, *ctx);
+  lk.unlock();
+  PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims),
+                 "Tensor numel not match!");
+  ctx->Wait();
+
+  VLOG(1) << send_tensor->numel() << " element in send tensor";
+
+  auto op = f::OpRegistry::CreateOp(*op1);
+  VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
+  op->Run(*scope, *ctx);
+  VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
 }

 // ncclAllReduceOp with desc
-TEST(NCCL, ncclInitOp) {
+TEST(NCCL, ncclAllReduceOp) {
   f::ProgramDescBind program;
   f::BlockDescBind *block = program.Block(0);
-  f::OpDescBind *op_desc = block->AppendOp();
+  f::OpDescBind *op1 = block->AppendOp();

-  op_desc->SetType("ncclAllReduce");
+  p::DeviceContext *ctx = new p::CPUDeviceContext(p::CPUPlace());

-  op_desc->SetOutput("Communicator", {"x1"});
-  op_desc->SetAttr("gpus", {gpu_list});
-  f::Scope g_scope;
-  paddle::platform::DeviceContext *ctx =
-      new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
+  CreateContext();

-  auto *var = g_scope.Var("x1");
-  var->GetMutable<p::Communicator>();
+  op1->SetType("ncclInit");
+  op1->SetOutput("Communicator", {"comm"});
+  op1->SetAttr("gpus", {gpu_list});

-  auto op = f::OpRegistry::CreateOp(*op_desc);
+  auto *var = g_scope.Var("comm");
+  var->GetMutable<p::Communicator>();
+
+  auto op = f::OpRegistry::CreateOp(*op1);
   VLOG(1) << "invoke NCCLInitOp.";
   op->Run(g_scope, *ctx);
   VLOG(1) << "NCCLInitOp finished.";
+  delete ctx;
+
+  f::OpDescBind *op2 = new f::OpDescBind;
+  op2->SetType("ncclAllReduce");
+  op2->SetInput("X", {"st"});
+  op2->SetInput("Communicator", {"comm"});
+  op2->SetOutput("Out", {"rt"});
+
+  std::vector<std::thread> ths;
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    std::thread th(DeviceProgram<float>, gpu_list[i], *op2,
+                   &g_scope.NewScope());
+    // std::thread th([=](){
+    //   VLOG(1) << "thread id created : " << i;
+    //   return 1;});
+    ths.emplace_back(std::move(th));
+  }
+
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    VLOG(1) << " thread joined!
" << i; + ths[i].join(); + } + VLOG(1) << " main thread joined!"; + + delete op2; + g_scope.~Scope(); + DestroyContext(); + VLOG(1) << " destory contexts"; } +// ncclBcastOp with desc +// TEST(NCCL, ncclBcastOp) { +// f::ProgramDescBind program; +// f::BlockDescBind *block = program.Block(0); +// f::OpDescBind *op1= block->AppendOp(); + +// p::DeviceContext *ctx = +// new p::CPUDeviceContext(p::CPUPlace()); + +// op1->SetType("ncclInit"); +// op1->SetOutput("Communicator", {"comm"}); +// op1->SetAttr("gpus", {gpu_list}); + +// auto *var = g_scope.Var("comm"); +// var->GetMutable(); + +// auto op = f::OpRegistry::CreateOp(*op1); +// VLOG(1) << "invoke NCCLInitOp."; +// op->Run(g_scope, *ctx); +// VLOG(1) << "NCCLInitOp finished."; + +// f::OpDescBind *op2 = new f::OpDescBind; +// op2->SetType("ncclBcastSend"); +// op2->SetInput("X", {"st"}); +// op2->SetInput("Communicator", {"comm"}); +// op2->SetOutput("Out", {"rt"}); + +// std::vector ths; +// for (size_t i=0; i < gpu_list.size(); ++i) { +// std::thread th(DeviceProgram, gpu_list[i], *op2); +// ths.emplace_back(std::move(th)); +// } + +// for (size_t i=0; i < gpu_list.size(); ++i) { +// ths[i].join(); +// } +// } + int main(int argc, char **argv) { - static int dev_count = paddle::platform::GetCUDADeviceCount(); + const int dev_count = p::GetCUDADeviceCount(); if (dev_count <= 1) { LOG(WARNING) << "Cannot test multi-gpu nccl, because the CUDA device count is " From 6d1493a46080eb6967f1ff9877e3c479153dd638 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Wed, 25 Oct 2017 09:24:55 -0700 Subject: [PATCH 32/81] "add bcast c++ test case" --- paddle/operators/nccl_op.cc | 7 -- paddle/operators/nccl_op.cu | 17 +-- paddle/operators/nccl_op_test.cu | 208 ++++++++++++++++--------------- 3 files changed, 115 insertions(+), 117 deletions(-) diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 85f589f4aa..ec7a89d5ff 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -74,15 +74,8 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { // reduction == "ncclMin" || reduction == "ncclMax"), // "invalid reduction."); - // auto in_dim = x_dims[0]; ctx->SetOutputsDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); - size_t N = x_dims.size(); - auto out_dims = ctx->GetOutputsDim("Out"); - for (size_t i = 0; i < N; ++i) { - VLOG(1) << " inference (X) " << framework::product(x_dims[i]) << " (Out)" - << framework::product(out_dims[i]); - } } }; diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index c507d325f2..68d0d5b7c9 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -58,12 +58,6 @@ class NCCLAllReduceKernel : public framework::OpKernel { boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(device_id); - size_t N = ins.size(); - for (size_t i = 0; i < N; ++i) { - VLOG(1) << " inference (X) " << framework::product(ins[i]->dims()) - << " (Out)" << framework::product(outs[i]->dims()); - } - for (size_t i = 0; i < ins.size(); ++i) { VLOG(1) << " invoke allreduce. 
send " << ins[i]->numel() << " recv " << outs[i]->numel(); @@ -87,8 +81,8 @@ class NCCLReduceKernel : public framework::OpKernel { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - auto ins = ctx.MultiInput("X"); // x0, x1, x2 - auto outs = ctx.MultiOutput("Out"); + auto ins = ctx.MultiInput("X"); // x0, x1, x2 + auto outs = ctx.MultiOutput("Out"); auto* comm = ctx.Input("Communicator"); @@ -108,10 +102,17 @@ class NCCLReduceKernel : public framework::OpKernel { if (root == device_id) { recvbuffer = outs[i]->mutable_data(ctx.GetPlace()); } + + VLOG(1) << " invoke reduce. send " << ins[i]->numel() << " recv " + << outs[i]->numel(); + PADDLE_ENFORCE(platform::dynload::ncclReduce( ins[i]->data(), recvbuffer, ins[i]->numel(), NCCLTypeWrapper::type, ncclSum, root, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << " finished reduce. send " << ins[i]->numel() << " recv " + << outs[i]->numel(); } } }; diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index 0509e6ddab..0e64802f17 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include #include @@ -24,6 +24,7 @@ #include "paddle/framework/block_desc.h" #include "paddle/framework/op_desc.h" +#include "paddle/framework/op_registry.h" #include "paddle/framework/program_desc.h" #include "paddle/framework/var_desc.h" #include "paddle/operators/nccl/nccl_gpu_common.h" @@ -32,8 +33,6 @@ #include "paddle/platform/gpu_info.h" #include "paddle/platform/place.h" -#include "paddle/framework/op_registry.h" - USE_NO_KERNEL_OP(ncclInit); USE_GPU_ONLY_OP(ncclAllReduce); USE_GPU_ONLY_OP(ncclReduce); @@ -44,51 +43,31 @@ namespace f = paddle::framework; namespace p = paddle::platform; static std::vector gpu_list; +static std::vector> dev_ctxs; +std::mutex mu; + +// test data amount +const f::DDim kDims = {100, 100}; // ncclInitOp with desc -// TEST(NCCL, ncclInitOp) { -// f::ProgramDescBind program; -// f::BlockDescBind *block = program.Block(0); -// f::OpDescBind *op_desc = block->AppendOp(); - -// op_desc->SetType("ncclInit"); -// op_desc->SetOutput("Communicator", {"x1"}); -// op_desc->SetAttr("gpus", {gpu_list}); -// f::Scope g_scope; -// p::DeviceContext *ctx = -// new p::CPUDeviceContext(p::CPUPlace()); - -// auto *var = g_scope.Var("x1"); -// var->GetMutable(); - -// auto op = f::OpRegistry::CreateOp(*op_desc); -// VLOG(1) << "invoke NCCLInitOp."; -// op->Run(g_scope, *ctx); -// VLOG(1) << "NCCLInitOp finished."; -// } +TEST(NCCL, ncclInitOp) { + std::unique_ptr op_desc(new f::OpDescBind); -// test data amount -static const f::DDim kDims = {100, 100}; -static std::vector dev_ctxs; + op_desc->SetType("ncclInit"); + op_desc->SetOutput("Communicator", {"x1"}); + op_desc->SetAttr("gpus", {gpu_list}); + f::Scope g_scope; + p::DeviceContext *ctx = new p::CPUDeviceContext(p::CPUPlace()); -void CreateContext() { - for (size_t i = 0; i < gpu_list.size(); ++i) { - p::GPUPlace place(i); - VLOG(1) << "create devicecontext : " << i; - dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); - } -} + auto *var = g_scope.Var("x1"); + var->GetMutable(); -void DestroyContext() { - for (size_t i = 0; i < gpu_list.size(); ++i) { - delete dev_ctxs[i]; - } + auto op = f::OpRegistry::CreateOp(*op_desc); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, *ctx); + VLOG(1) << "NCCLInitOp finished."; } -// global scope -static f::Scope g_scope; -std::mutex mu; - 
template void DeviceProgram(int gpu_id, const f::OpDescBind &op_desc, f::Scope *scope) { std::unique_lock lk(mu); @@ -98,18 +77,12 @@ void DeviceProgram(int gpu_id, const f::OpDescBind &op_desc, f::Scope *scope) { *op1 = op_desc; p::GPUPlace place(gpu_id); - // p::DeviceContext *ctx = - // new p::CUDADeviceContext(place); - p::DeviceContext *ctx = dev_ctxs.at(gpu_id); - VLOG(1) << "device context : " << dev_ctxs.size() << " gpu_id " << gpu_id; - - // f::Scope &local_scope = g_scope.NewScope(); + auto ctx = dev_ctxs.at(gpu_id); auto *send_tensor = scope->Var("st")->GetMutable(); auto *recv_tensor = scope->Var("rt")->GetMutable(); send_tensor->Resize(kDims); send_tensor->mutable_data(kDims, place); - // recv_tensor->mutable_data(kDims, place); std::vector send_vector(f::product(kDims), gpu_id); send_tensor->CopyFromVector(send_vector, *ctx); @@ -118,7 +91,7 @@ void DeviceProgram(int gpu_id, const f::OpDescBind &op_desc, f::Scope *scope) { "Tensor numel not match!"); ctx->Wait(); - VLOG(1) << send_tensor->numel() << " element in send tensor"; + VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); auto op = f::OpRegistry::CreateOp(*op1); VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); @@ -128,14 +101,10 @@ void DeviceProgram(int gpu_id, const f::OpDescBind &op_desc, f::Scope *scope) { // ncclAllReduceOp with desc TEST(NCCL, ncclAllReduceOp) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.Block(0); - f::OpDescBind *op1 = block->AppendOp(); - - p::DeviceContext *ctx = new p::CPUDeviceContext(p::CPUPlace()); - - CreateContext(); + std::unique_ptr ctx(new p::CPUDeviceContext(p::CPUPlace())); + std::unique_ptr g_scope(new Scope); + std::unique_ptr op1(new f::OpDescBind); op1->SetType("ncclInit"); op1->SetOutput("Communicator", {"comm"}); op1->SetAttr("gpus", {gpu_list}); @@ -149,7 +118,7 @@ TEST(NCCL, ncclAllReduceOp) { VLOG(1) << "NCCLInitOp finished."; delete ctx; - f::OpDescBind *op2 = new f::OpDescBind; + std::unique_ptr op2(new f::OpDescBind); op2->SetType("ncclAllReduce"); op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); @@ -159,61 +128,89 @@ TEST(NCCL, ncclAllReduceOp) { for (size_t i = 0; i < gpu_list.size(); ++i) { std::thread th(DeviceProgram, gpu_list[i], *op2, &g_scope.NewScope()); - // std::thread th([=](){ - // VLOG(1) << "thread id created : " << i; - // return 1;}); ths.emplace_back(std::move(th)); } for (size_t i = 0; i < gpu_list.size(); ++i) { - VLOG(1) << " thread joined! 
" << i; ths[i].join(); } - VLOG(1) << " main thread joined!"; + g_scope->reset(nullptr); +} + +// ncclReduceOp with desc +TEST(NCCL, ncclReduceOp) { + std::unique_ptr ctx(new p::CPUDeviceContext(p::CPUPlace())); + std::unique_ptr g_scope(new Scope); + + std::unique_ptr op1(new f::OpDescBind); + op1->SetType("ncclInit"); + op1->SetOutput("Communicator", {"comm"}); + op1->SetAttr("gpus", {gpu_list}); + + auto *var = g_scope.Var("comm"); + var->GetMutable(); + + auto op = f::OpRegistry::CreateOp(*op1); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, *ctx); + VLOG(1) << "NCCLInitOp finished."; + delete ctx; + + std::unique_ptr op2(new f::OpDescBind); + op2->SetType("ncclReduce"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); - delete op2; - g_scope.~Scope(); - DestroyContext(); - VLOG(1) << " destory contexts"; + std::vector ths; + for (size_t i = 0; i < gpu_list.size(); ++i) { + std::thread th(DeviceProgram, gpu_list[i], *op2, + &g_scope.NewScope()); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + g_scope->reset(nullptr); } // ncclBcastOp with desc -// TEST(NCCL, ncclBcastOp) { -// f::ProgramDescBind program; -// f::BlockDescBind *block = program.Block(0); -// f::OpDescBind *op1= block->AppendOp(); - -// p::DeviceContext *ctx = -// new p::CPUDeviceContext(p::CPUPlace()); - -// op1->SetType("ncclInit"); -// op1->SetOutput("Communicator", {"comm"}); -// op1->SetAttr("gpus", {gpu_list}); - -// auto *var = g_scope.Var("comm"); -// var->GetMutable(); - -// auto op = f::OpRegistry::CreateOp(*op1); -// VLOG(1) << "invoke NCCLInitOp."; -// op->Run(g_scope, *ctx); -// VLOG(1) << "NCCLInitOp finished."; - -// f::OpDescBind *op2 = new f::OpDescBind; -// op2->SetType("ncclBcastSend"); -// op2->SetInput("X", {"st"}); -// op2->SetInput("Communicator", {"comm"}); -// op2->SetOutput("Out", {"rt"}); - -// std::vector ths; -// for (size_t i=0; i < gpu_list.size(); ++i) { -// std::thread th(DeviceProgram, gpu_list[i], *op2); -// ths.emplace_back(std::move(th)); -// } - -// for (size_t i=0; i < gpu_list.size(); ++i) { -// ths[i].join(); -// } -// } +TEST(NCCL, ncclBcastOp) { + f::ProgramDescBind program; + f::BlockDescBind *block = program.Block(0); + f::OpDescBind *op1 = block->AppendOp(); + + p::DeviceContext *ctx = new p::CPUDeviceContext(p::CPUPlace()); + + op1->SetType("ncclInit"); + op1->SetOutput("Communicator", {"comm"}); + op1->SetAttr("gpus", {gpu_list}); + + auto *var = g_scope.Var("comm"); + var->GetMutable(); + + auto op = f::OpRegistry::CreateOp(*op1); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, *ctx); + VLOG(1) << "NCCLInitOp finished."; + + f::OpDescBind *op2 = new f::OpDescBind; + op2->SetType("ncclBcastSend"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + + std::vector ths; + for (size_t i = 0; i < gpu_list.size(); ++i) { + std::thread th(DeviceProgram, gpu_list[i], *op2); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } +} int main(int argc, char **argv) { const int dev_count = p::GetCUDADeviceCount(); @@ -228,5 +225,12 @@ int main(int argc, char **argv) { gpu_list.emplace_back(i); } testing::InitGoogleTest(&argc, argv); + + // device context should be release before scope. + // otherwise driver will down. 
+ for (size_t i = 0; i < gpu_list.size(); ++i) { + p::GPUPlace place(i); + dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); + } return RUN_ALL_TESTS(); } From 11cf3e3a43e0d5527e7a4e2abab2836aaa2d0338 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Wed, 25 Oct 2017 10:50:00 -0700 Subject: [PATCH 33/81] "refactorization of nccl test case" --- paddle/operators/nccl_op_test.cu | 235 +++++++++++++++---------------- 1 file changed, 111 insertions(+), 124 deletions(-) diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index 0e64802f17..8c54a3dcba 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -43,81 +43,107 @@ namespace f = paddle::framework; namespace p = paddle::platform; static std::vector gpu_list; -static std::vector> dev_ctxs; -std::mutex mu; // test data amount const f::DDim kDims = {100, 100}; -// ncclInitOp with desc -TEST(NCCL, ncclInitOp) { - std::unique_ptr op_desc(new f::OpDescBind); +// nccl op common tester, init communicator. +class NCCLTester : public ::testing::Test { + public: + virtual void SetUp() override { + cpu_ctx = new p::CPUDeviceContext(p::CPUPlace()); + for (size_t i = 0; i < gpu_list.size(); ++i) { + p::GPUPlace place(i); + dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); + } + + NCCLInitOp(); + } - op_desc->SetType("ncclInit"); - op_desc->SetOutput("Communicator", {"x1"}); - op_desc->SetAttr("gpus", {gpu_list}); - f::Scope g_scope; - p::DeviceContext *ctx = new p::CPUDeviceContext(p::CPUPlace()); + virtual void TearDown() override { + for (auto &device_context : dev_ctxs) { + delete device_context; + } + } - auto *var = g_scope.Var("x1"); - var->GetMutable(); + void NCCLInitOp() { + std::unique_ptr op1(new f::OpDescBind); - auto op = f::OpRegistry::CreateOp(*op_desc); - VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, *ctx); - VLOG(1) << "NCCLInitOp finished."; -} + op1->SetType("ncclInit"); + op1->SetOutput("Communicator", {"comm"}); + op1->SetAttr("gpus", {gpu_list}); -template -void DeviceProgram(int gpu_id, const f::OpDescBind &op_desc, f::Scope *scope) { - std::unique_lock lk(mu); - f::ProgramDescBind program; - f::BlockDescBind *block = program.Block(0); - f::OpDescBind *op1 = block->AppendOp(); - *op1 = op_desc; - - p::GPUPlace place(gpu_id); - auto ctx = dev_ctxs.at(gpu_id); - - auto *send_tensor = scope->Var("st")->GetMutable(); - auto *recv_tensor = scope->Var("rt")->GetMutable(); - send_tensor->Resize(kDims); - send_tensor->mutable_data(kDims, place); - - std::vector send_vector(f::product(kDims), gpu_id); - send_tensor->CopyFromVector(send_vector, *ctx); - lk.unlock(); - PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims), - "Tensor numel not match!"); - ctx->Wait(); - - VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); - - auto op = f::OpRegistry::CreateOp(*op1); - VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); - op->Run(*scope, *ctx); - VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); -} + auto *var = g_scope.Var("comm"); + var->GetMutable(); -// ncclAllReduceOp with desc -TEST(NCCL, ncclAllReduceOp) { - std::unique_ptr ctx(new p::CPUDeviceContext(p::CPUPlace())); - std::unique_ptr g_scope(new Scope); + auto op = f::OpRegistry::CreateOp(*op1); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, *cpu_ctx); + VLOG(1) << "NCCLInitOp finished."; + } + + template + void PerThreadProgram(int gpu_id, const f::OpDescBind &op_desc, + f::Scope *scope) { + std::unique_lock lk(mu); + f::ProgramDescBind 
program; + f::BlockDescBind *block = program.Block(0); + f::OpDescBind *op1 = block->AppendOp(); + *op1 = op_desc; + + p::GPUPlace place(gpu_id); + auto &ctx = dev_ctxs.at(gpu_id); + + auto *send_tensor = scope->Var("st")->GetMutable(); + auto *recv_tensor = scope->Var("rt")->GetMutable(); + send_tensor->Resize(kDims); + send_tensor->mutable_data(kDims, place); + + std::vector send_vector(f::product(kDims), gpu_id); + send_tensor->CopyFromVector(send_vector, *ctx); + lk.unlock(); + PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims), + "Tensor numel not match!"); + ctx->Wait(); + + VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); + + auto op = f::OpRegistry::CreateOp(*op1); + VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); + op->Run(*scope, *ctx); + VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); + } - std::unique_ptr op1(new f::OpDescBind); - op1->SetType("ncclInit"); - op1->SetOutput("Communicator", {"comm"}); - op1->SetAttr("gpus", {gpu_list}); + public: + std::vector dev_ctxs; + p::DeviceContext *cpu_ctx; + f::Scope g_scope; + std::mutex mu; +}; + +// ncclInitOp with desc +// TEST(NCCL, ncclInitOp) { +// std::unique_ptr op_desc(new f::OpDescBind); + +// op_desc->SetType("ncclInit"); +// op_desc->SetOutput("Communicator", {"x1"}); +// op_desc->SetAttr("gpus", {gpu_list}); + +// f::Scope g_scope; +// std::unique_ptr ctx(new +// p::CPUDeviceContext(p::CPUPlace())); - auto *var = g_scope.Var("comm"); - var->GetMutable(); +// auto *var = g_scope.Var("x1"); +// var->GetMutable(); - auto op = f::OpRegistry::CreateOp(*op1); - VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, *ctx); - VLOG(1) << "NCCLInitOp finished."; - delete ctx; +// auto op = f::OpRegistry::CreateOp(*op_desc); +// VLOG(1) << "invoke NCCLInitOp."; +// op->Run(g_scope, *ctx.get()); +// VLOG(1) << "NCCLInitOp finished."; +// } +// ncclAllReduceOp with desc +TEST_F(NCCLTester, ncclAllReduceOp) { std::unique_ptr op2(new f::OpDescBind); op2->SetType("ncclAllReduce"); op2->SetInput("X", {"st"}); @@ -126,36 +152,18 @@ TEST(NCCL, ncclAllReduceOp) { std::vector ths; for (size_t i = 0; i < gpu_list.size(); ++i) { - std::thread th(DeviceProgram, gpu_list[i], *op2, - &g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), &g_scope.NewScope()); ths.emplace_back(std::move(th)); } for (size_t i = 0; i < gpu_list.size(); ++i) { ths[i].join(); } - g_scope->reset(nullptr); } // ncclReduceOp with desc TEST(NCCL, ncclReduceOp) { - std::unique_ptr ctx(new p::CPUDeviceContext(p::CPUPlace())); - std::unique_ptr g_scope(new Scope); - - std::unique_ptr op1(new f::OpDescBind); - op1->SetType("ncclInit"); - op1->SetOutput("Communicator", {"comm"}); - op1->SetAttr("gpus", {gpu_list}); - - auto *var = g_scope.Var("comm"); - var->GetMutable(); - - auto op = f::OpRegistry::CreateOp(*op1); - VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, *ctx); - VLOG(1) << "NCCLInitOp finished."; - delete ctx; - std::unique_ptr op2(new f::OpDescBind); op2->SetType("ncclReduce"); op2->SetInput("X", {"st"}); @@ -164,53 +172,36 @@ TEST(NCCL, ncclReduceOp) { std::vector ths; for (size_t i = 0; i < gpu_list.size(); ++i) { - std::thread th(DeviceProgram, gpu_list[i], *op2, - &g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), &g_scope.NewScope()); ths.emplace_back(std::move(th)); } for (size_t i = 0; i < gpu_list.size(); ++i) { ths[i].join(); } - g_scope->reset(nullptr); } // ncclBcastOp with desc -TEST(NCCL, 
ncclBcastOp) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.Block(0); - f::OpDescBind *op1 = block->AppendOp(); - - p::DeviceContext *ctx = new p::CPUDeviceContext(p::CPUPlace()); - - op1->SetType("ncclInit"); - op1->SetOutput("Communicator", {"comm"}); - op1->SetAttr("gpus", {gpu_list}); - - auto *var = g_scope.Var("comm"); - var->GetMutable(); - - auto op = f::OpRegistry::CreateOp(*op1); - VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, *ctx); - VLOG(1) << "NCCLInitOp finished."; - - f::OpDescBind *op2 = new f::OpDescBind; - op2->SetType("ncclBcastSend"); - op2->SetInput("X", {"st"}); - op2->SetInput("Communicator", {"comm"}); - op2->SetOutput("Out", {"rt"}); - - std::vector ths; - for (size_t i = 0; i < gpu_list.size(); ++i) { - std::thread th(DeviceProgram, gpu_list[i], *op2); - ths.emplace_back(std::move(th)); - } - - for (size_t i = 0; i < gpu_list.size(); ++i) { - ths[i].join(); - } -} +// TEST(NCCL, ncclBcastOp) { +// std::unique_ptr op2(new f::OpDescBind); +// op2->SetType("ncclBcastSend"); +// op2->SetInput("X", {"st"}); +// op2->SetInput("Communicator", {"comm"}); +// op2->SetOutput("Out", {"rt"}); + +// std::vector ths; +// for (size_t i = 0; i < gpu_list.size(); ++i) { +// std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], +// *op2.get(), +// &g_scope.NewScope()); +// ths.emplace_back(std::move(th)); +// } + +// for (size_t i = 0; i < gpu_list.size(); ++i) { +// ths[i].join(); +// } +// } int main(int argc, char **argv) { const int dev_count = p::GetCUDADeviceCount(); @@ -228,9 +219,5 @@ int main(int argc, char **argv) { // device context should be release before scope. // otherwise driver will down. - for (size_t i = 0; i < gpu_list.size(); ++i) { - p::GPUPlace place(i); - dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); - } return RUN_ALL_TESTS(); } From 94992a990b2716d19427b4758060a5196baf1c56 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Wed, 25 Oct 2017 12:55:14 -0700 Subject: [PATCH 34/81] "add multiop testcase" --- paddle/operators/nccl_op.cc | 4 ++ paddle/operators/nccl_op_test.cu | 84 ++++++++++++++++++++++++++++++-- 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index ec7a89d5ff..5b6c9bec70 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -93,6 +93,10 @@ class NCCLReduceOp : public framework::OperatorWithKernel { " Input(Communicator) of Reduce op input should not be NULL"); PADDLE_ENFORCE(ctx->HasOutput("Out"), " Input(X) of Reduce op input should not be NULL"); + + auto x_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index 8c54a3dcba..0eda0c6b57 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -150,16 +151,41 @@ TEST_F(NCCLTester, ncclAllReduceOp) { op2->SetInput("Communicator", {"comm"}); op2->SetOutput("Out", {"rt"}); + std::vector dev_scopes; + std::vector ths; + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], - *op2.get(), &g_scope.NewScope()); + *op2.get(), dev_scopes[i]); ths.emplace_back(std::move(th)); } for (size_t i = 0; i < gpu_list.size(); ++i) { ths[i].join(); } + + // check results + float result = 0; + std::accumulate(gpu_list.begin(), 
gpu_list.end(), result); + for (size_t i = 0; i < dev_scopes.size(); ++i) { + auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + + p::CPUPlace cpu_place; + auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[i])->stream()); + for (size_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } + } } // ncclReduceOp with desc @@ -170,24 +196,76 @@ TEST(NCCL, ncclReduceOp) { op2->SetInput("Communicator", {"comm"}); op2->SetOutput("Out", {"rt"}); + std::vector dev_scopes; + std::vector ths; for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], - *op2.get(), &g_scope.NewScope()); + *op2.get(), dev_scopes[i]); ths.emplace_back(std::move(th)); } for (size_t i = 0; i < gpu_list.size(); ++i) { ths[i].join(); } + + // check results + float result = 0; + std::accumulate(gpu_list.begin(), gpu_list.end(), result); + for (size_t i = 0; i < dev_scopes.size(); ++i) { + auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + + p::CPUPlace cpu_place; + auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[i])->stream()); + for (size_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } + } } // ncclBcastOp with desc -// TEST(NCCL, ncclBcastOp) { +TEST(NCCL, ncclBcastOp) { + std::unique_ptr op1(new f::OpDescBind); + op1->SetType("ncclBcastSend"); + op1->SetInput("X", {"st"}); + op1->SetInput("Communicator", {"comm"}); + + std::unique_ptr op2(new f::OpDescBind); + op2->SetType("ncclBcastRecv"); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + + std::vector ths; + for (size_t i = 1; i < gpu_list.size(); ++i) { + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), &g_scope.NewScope()); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } +} + +// joint ncclBcastOp and ncclReduceOp +// TEST(NCCL, MultipleOp) { // std::unique_ptr op2(new f::OpDescBind); // op2->SetType("ncclBcastSend"); // op2->SetInput("X", {"st"}); // op2->SetInput("Communicator", {"comm"}); + +// std::unique_ptr op2(new f::OpDescBind); +// op2->SetType("ncclBcastRecv"); +// op2->SetInput("Communicator", {"comm"}); // op2->SetOutput("Out", {"rt"}); // std::vector ths; From 38d3adfeb6683ef3b2c579fa55264ea5c20b5201 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Wed, 25 Oct 2017 15:02:37 -0700 Subject: [PATCH 35/81] "add multioperator testcase" --- paddle/operators/nccl_op.cc | 71 ++++------ paddle/operators/nccl_op.cu | 13 +- paddle/operators/nccl_op_test.cu | 217 +++++++++++++++++++++---------- 3 files changed, 180 insertions(+), 121 deletions(-) diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 5b6c9bec70..67bcc419fa 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -100,8 +100,8 @@ class NCCLReduceOp : public framework::OperatorWithKernel { } }; -// BcastSendOp -class NCCLBcastSendOp : public 
framework::OperatorWithKernel { +// BcastOp +class NCCLBcastOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -111,20 +111,12 @@ class NCCLBcastSendOp : public framework::OperatorWithKernel { " Input(X) of Bcast op input should not be NULL"); PADDLE_ENFORCE(ctx->HasInput("Communicator"), " Input(Communicator) of Bcast op input should not be NULL"); - } -}; - -// BcastRecvOp -class NCCLBcastRecvOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Communicator"), - " Input(Communicator) of Bcast op input should not be NULL"); PADDLE_ENFORCE(ctx->HasOutput("Out"), " Output(Out) of Bcast op output should not be NULL"); + + auto x_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); } }; @@ -146,52 +138,41 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { } }; -// BcastSend should be in the root -// BcastSendOp -class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker { +// ReduceOp +class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCCLBcastSendOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + NCCLReduceOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input of BcastSend op"); + AddInput("X", "The input of Reduce op"); AddInput("Communicator", "Communicator for communicating between gpus"); - AddAttr("root", "root gpu of Bcast"); + AddOutput("Out", "The output of Reduce op"); + AddAttr("root", + "root gpu of the parameter. if not set(-1). hashed by name.") + .SetDefault(-1); AddComment(R"DOC( - Bcast the tensors. - )DOC"); + Reduce the tensors)DOC"); } }; // BcastOp -class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker { +class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCCLBcastRecvOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + NCCLBcastOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of BcastSend op"); AddInput("Communicator", "Communicator for communicating between gpus"); - AddAttr("root", "root gpu of BcastRecv"); AddOutput("Out", "The output of Bcast"); + AddAttr("root", + "root gpu of the parameter. if not set(-1). hashed by name.") + .SetDefault(-1); AddComment(R"DOC( Bcast the tensors. )DOC"); } }; -// BcastRecvOp -class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - NCCLReduceOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input of Reduce op"); - AddInput("Communicator", "Communicator for communicating between gpus"); - AddOutput("Out", "The output of Reduce op"); - AddComment(R"DOC( - Reduce the tensors. 
- )DOC"); - } -}; - } // namespace operators } // namespace paddle @@ -201,9 +182,7 @@ REGISTER_OPERATOR(ncclInit, ops::NCCLInitOp, REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp, ops::NCCLAllReduceOpMaker); -REGISTER_OP_WITHOUT_GRADIENT(ncclBcastSend, ops::NCCLBcastSendOp, - ops::NCCLBcastSendOpMaker); -REGISTER_OP_WITHOUT_GRADIENT(ncclBcastRecv, ops::NCCLBcastRecvOp, - ops::NCCLBcastRecvOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(ncclBcast, ops::NCCLBcastOp, + ops::NCCLBcastOpMaker); REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp, ops::NCCLReduceOpMaker); diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index 68d0d5b7c9..eb7d4387ef 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -83,6 +83,7 @@ class NCCLReduceKernel : public framework::OpKernel { auto ins = ctx.MultiInput("X"); // x0, x1, x2 auto outs = ctx.MultiOutput("Out"); + int root = ctx.Attr("root"); auto* comm = ctx.Input("Communicator"); @@ -97,7 +98,9 @@ class NCCLReduceKernel : public framework::OpKernel { auto ins_names = ctx.Inputs("X"); std::hash hasher; for (size_t i = 0; i < ins.size(); ++i) { - int root = hasher(ins_names[i]) % comm->comms_.size(); + if (root == -1) { + root = hasher(ins_names[i]) % comm->comms_.size(); + } T* recvbuffer = nullptr; if (root == device_id) { recvbuffer = outs[i]->mutable_data(ctx.GetPlace()); @@ -135,8 +138,9 @@ class NCCLBcastKernel : public framework::OpKernel { int device_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(device_id); + if (idx == root) { - auto ins = ctx.MultiInput("X"); + auto ins = ctx.MultiInput("X"); for (size_t i = 0; i < ins.size(); ++i) { PADDLE_ENFORCE(platform::dynload::ncclBcast( (void*)ins[i]->data(), ins[i]->numel(), NCCLTypeWrapper::type, @@ -144,7 +148,7 @@ class NCCLBcastKernel : public framework::OpKernel { PADDLE_ENFORCE(cudaStreamSynchronize(stream)); } } else { - auto outs = ctx.MultiOutput("Out"); + auto outs = ctx.MultiOutput("Out"); for (size_t i = 0; i < outs.size(); ++i) { PADDLE_ENFORCE(platform::dynload::ncclBcast( outs[i]->mutable_data(ctx.GetPlace()), outs[i]->numel(), @@ -160,6 +164,5 @@ class NCCLBcastKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); -REGISTER_OP_GPU_KERNEL(ncclBcastSend, ops::NCCLBcastKernel); +REGISTER_OP_GPU_KERNEL(ncclBcast, ops::NCCLBcastKernel); REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel); -REGISTER_OP_GPU_KERNEL(ncclBcastRecv, ops::NCCLBcastKernel); diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index 0eda0c6b57..71491d47bb 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -28,6 +28,7 @@ #include "paddle/framework/op_registry.h" #include "paddle/framework/program_desc.h" #include "paddle/framework/var_desc.h" +#include "paddle/operators/math/math_function.h" #include "paddle/operators/nccl/nccl_gpu_common.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" @@ -37,8 +38,7 @@ USE_NO_KERNEL_OP(ncclInit); USE_GPU_ONLY_OP(ncclAllReduce); USE_GPU_ONLY_OP(ncclReduce); -USE_GPU_ONLY_OP(ncclBcastSend); -USE_GPU_ONLY_OP(ncclBcastRecv); +USE_GPU_ONLY_OP(ncclBcast); namespace f = paddle::framework; namespace p = paddle::platform; @@ -144,12 +144,62 @@ class NCCLTester : public ::testing::Test { // } // ncclAllReduceOp with desc -TEST_F(NCCLTester, ncclAllReduceOp) { +// TEST_F(NCCLTester, ncclAllReduceOp) { +// std::unique_ptr 
op2(new f::OpDescBind); +// op2->SetType("ncclAllReduce"); +// op2->SetInput("X", {"st"}); +// op2->SetInput("Communicator", {"comm"}); +// op2->SetOutput("Out", {"rt"}); + +// std::vector dev_scopes; + +// std::vector ths; + +// for (size_t i = 0; i < gpu_list.size(); ++i) { +// dev_scopes.emplace_back(&g_scope.NewScope()); +// std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], +// *op2.get(), dev_scopes[i]); +// ths.emplace_back(std::move(th)); +// } + +// for (size_t i = 0; i < gpu_list.size(); ++i) { +// ths[i].join(); +// } + +// // check results +// float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + +// for (size_t i = 0; i < dev_scopes.size(); ++i) { +// p::CPUPlace cpu_place; +// p::GPUPlace gpu_place(gpu_list[i]); + +// auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); +// auto *rt = recv_tensor.data(); +// auto *result_tensor = +// dev_scopes[i]->Var("ct")->GetMutable(); +// result_tensor->Resize(kDims); +// auto *ct = result_tensor->mutable_data(cpu_place); + +// paddle::memory::Copy( +// cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, +// recv_tensor.numel() * sizeof(float), +// static_cast(dev_ctxs[i])->stream()); + +// for (size_t j = 0; j < f::product(kDims); ++j) { +// ASSERT_NEAR(ct[j], result, 1e-5); +// } +// } +// } + +// ncclAReduceOp with desc +TEST_F(NCCLTester, ncclReduceOp) { std::unique_ptr op2(new f::OpDescBind); - op2->SetType("ncclAllReduce"); + const int kRoot = 0; + op2->SetType("ncclReduce"); op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); op2->SetOutput("Out", {"rt"}); + op2->SetAttr("root", {kRoot}); std::vector dev_scopes; @@ -166,39 +216,43 @@ TEST_F(NCCLTester, ncclAllReduceOp) { ths[i].join(); } - // check results - float result = 0; - std::accumulate(gpu_list.begin(), gpu_list.end(), result); - for (size_t i = 0; i < dev_scopes.size(); ++i) { - auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); - auto *rt = recv_tensor.data(); + // check results on + float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); - p::CPUPlace cpu_place; - auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable(); - result_tensor->Resize(kDims); - auto *ct = result_tensor->mutable_data(cpu_place); + p::CPUPlace cpu_place; + p::GPUPlace gpu_place(gpu_list[kRoot]); - paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, - recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs[i])->stream()); - for (size_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], result, 1e-5); - } + auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = + dev_scopes[kRoot]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[kRoot])->stream()); + + for (int j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); } } -// ncclReduceOp with desc -TEST(NCCL, ncclReduceOp) { +// // ncclBcastOp with desc +TEST_F(NCCLTester, ncclBcastOp) { std::unique_ptr op2(new f::OpDescBind); - op2->SetType("ncclReduce"); + const int kRoot = 0; + op2->SetType("ncclBcast"); op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); op2->SetOutput("Out", {"rt"}); + op2->SetAttr("root", {kRoot}); std::vector dev_scopes; std::vector ths; + for (size_t i = 0; i < gpu_list.size(); ++i) { dev_scopes.emplace_back(&g_scope.NewScope()); 
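    // one child scope per rank: the per-rank "st"/"rt" variables stay
    // private, while the shared "comm" is found by lookup through the parent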
std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], @@ -210,76 +264,99 @@ TEST(NCCL, ncclReduceOp) { ths[i].join(); } - // check results - float result = 0; - std::accumulate(gpu_list.begin(), gpu_list.end(), result); - for (size_t i = 0; i < dev_scopes.size(); ++i) { - auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); - auto *rt = recv_tensor.data(); + const int idx = 1; + // check results on + float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); - p::CPUPlace cpu_place; - auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable(); - result_tensor->Resize(kDims); - auto *ct = result_tensor->mutable_data(cpu_place); + p::CPUPlace cpu_place; + p::GPUPlace gpu_place(gpu_list[idx]); - paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, - recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs[i])->stream()); - for (size_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], result, 1e-5); - } + auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[idx])->stream()); + + for (size_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); } } -// ncclBcastOp with desc -TEST(NCCL, ncclBcastOp) { +// joint ncclBcastOp and ncclReduceOp +TEST_F(NCCLTester, MultipleOp) { + const int kRoot = 0; std::unique_ptr op1(new f::OpDescBind); - op1->SetType("ncclBcastSend"); - op1->SetInput("X", {"st"}); + op1->SetType("ncclReduce"); + op1->SetInput("X", {"rt"}); op1->SetInput("Communicator", {"comm"}); + op1->SetOutput("Out", {"rt"}); + op2->SetAttr("root", {kRoot}); std::unique_ptr op2(new f::OpDescBind); - op2->SetType("ncclBcastRecv"); + op2->SetType("ncclBcast"); + op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); op2->SetOutput("Out", {"rt"}); + op2->SetAttr("root", {kRoot}); + + std::vector dev_scopes; std::vector ths; - for (size_t i = 1; i < gpu_list.size(); ++i) { + + // run Bcast + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], - *op2.get(), &g_scope.NewScope()); + *op1.get(), dev_scopes[i]); ths.emplace_back(std::move(th)); } for (size_t i = 0; i < gpu_list.size(); ++i) { ths[i].join(); } -} -// joint ncclBcastOp and ncclReduceOp -// TEST(NCCL, MultipleOp) { -// std::unique_ptr op2(new f::OpDescBind); -// op2->SetType("ncclBcastSend"); -// op2->SetInput("X", {"st"}); -// op2->SetInput("Communicator", {"comm"}); + ths.clear(); -// std::unique_ptr op2(new f::OpDescBind); -// op2->SetType("ncclBcastRecv"); -// op2->SetInput("Communicator", {"comm"}); -// op2->SetOutput("Out", {"rt"}); + // run Reduce + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } -// std::vector ths; -// for (size_t i = 0; i < gpu_list.size(); ++i) { -// std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], -// *op2.get(), -// &g_scope.NewScope()); -// ths.emplace_back(std::move(th)); -// } + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } -// for (size_t i = 0; i < gpu_list.size(); ++i) { -// 
ths[i].join(); -// } -// } + // check results + float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + + for (size_t i = 0; i < dev_scopes.size(); ++i) { + p::CPUPlace cpu_place; + p::GPUPlace gpu_place(gpu_list[i]); + + auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[i])->stream()); + + for (int j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } + } +} int main(int argc, char **argv) { const int dev_count = p::GetCUDADeviceCount(); From 61c1b0469a4d320a1f328ceac85052625e666254 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Wed, 25 Oct 2017 15:26:16 -0700 Subject: [PATCH 36/81] "fix multigpu testcase" --- paddle/operators/nccl_op.cu | 8 ++ paddle/operators/nccl_op_test.cu | 130 +++++++++++++++---------------- 2 files changed, 72 insertions(+), 66 deletions(-) diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index eb7d4387ef..9b9e1df258 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -142,18 +142,26 @@ class NCCLBcastKernel : public framework::OpKernel { if (idx == root) { auto ins = ctx.MultiInput("X"); for (size_t i = 0; i < ins.size(); ++i) { + VLOG(1) << " invoke Bcast. send " << ins[i]->numel(); + PADDLE_ENFORCE(platform::dynload::ncclBcast( (void*)ins[i]->data(), ins[i]->numel(), NCCLTypeWrapper::type, root, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << " finished Bcast."; } } else { auto outs = ctx.MultiOutput("Out"); for (size_t i = 0; i < outs.size(); ++i) { + VLOG(1) << " invoke Bcast. recv. "; + PADDLE_ENFORCE(platform::dynload::ncclBcast( outs[i]->mutable_data(ctx.GetPlace()), outs[i]->numel(), NCCLTypeWrapper::type, root, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << " finished Bcast. 
recv " << outs[i]->numel(); } } } diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index 71491d47bb..d785b279d6 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -123,73 +123,71 @@ class NCCLTester : public ::testing::Test { }; // ncclInitOp with desc -// TEST(NCCL, ncclInitOp) { -// std::unique_ptr op_desc(new f::OpDescBind); +TEST(NCCL, ncclInitOp) { + std::unique_ptr op_desc(new f::OpDescBind); -// op_desc->SetType("ncclInit"); -// op_desc->SetOutput("Communicator", {"x1"}); -// op_desc->SetAttr("gpus", {gpu_list}); + op_desc->SetType("ncclInit"); + op_desc->SetOutput("Communicator", {"x1"}); + op_desc->SetAttr("gpus", {gpu_list}); -// f::Scope g_scope; -// std::unique_ptr ctx(new -// p::CPUDeviceContext(p::CPUPlace())); + f::Scope g_scope; + std::unique_ptr ctx(new p::CPUDeviceContext(p::CPUPlace())); -// auto *var = g_scope.Var("x1"); -// var->GetMutable(); + auto *var = g_scope.Var("x1"); + var->GetMutable(); -// auto op = f::OpRegistry::CreateOp(*op_desc); -// VLOG(1) << "invoke NCCLInitOp."; -// op->Run(g_scope, *ctx.get()); -// VLOG(1) << "NCCLInitOp finished."; -// } + auto op = f::OpRegistry::CreateOp(*op_desc); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, *ctx.get()); + VLOG(1) << "NCCLInitOp finished."; +} // ncclAllReduceOp with desc -// TEST_F(NCCLTester, ncclAllReduceOp) { -// std::unique_ptr op2(new f::OpDescBind); -// op2->SetType("ncclAllReduce"); -// op2->SetInput("X", {"st"}); -// op2->SetInput("Communicator", {"comm"}); -// op2->SetOutput("Out", {"rt"}); - -// std::vector dev_scopes; - -// std::vector ths; - -// for (size_t i = 0; i < gpu_list.size(); ++i) { -// dev_scopes.emplace_back(&g_scope.NewScope()); -// std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], -// *op2.get(), dev_scopes[i]); -// ths.emplace_back(std::move(th)); -// } - -// for (size_t i = 0; i < gpu_list.size(); ++i) { -// ths[i].join(); -// } - -// // check results -// float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); - -// for (size_t i = 0; i < dev_scopes.size(); ++i) { -// p::CPUPlace cpu_place; -// p::GPUPlace gpu_place(gpu_list[i]); - -// auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); -// auto *rt = recv_tensor.data(); -// auto *result_tensor = -// dev_scopes[i]->Var("ct")->GetMutable(); -// result_tensor->Resize(kDims); -// auto *ct = result_tensor->mutable_data(cpu_place); - -// paddle::memory::Copy( -// cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, -// recv_tensor.numel() * sizeof(float), -// static_cast(dev_ctxs[i])->stream()); - -// for (size_t j = 0; j < f::product(kDims); ++j) { -// ASSERT_NEAR(ct[j], result, 1e-5); -// } -// } -// } +TEST_F(NCCLTester, ncclAllReduceOp) { + std::unique_ptr op2(new f::OpDescBind); + op2->SetType("ncclAllReduce"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + + // check results + float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + + for (size_t i = 0; i < dev_scopes.size(); ++i) { + p::CPUPlace cpu_place; + p::GPUPlace gpu_place(gpu_list[i]); + + auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); + auto 
*rt = recv_tensor.data(); + auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[i])->stream()); + + for (size_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } + } +} // ncclAReduceOp with desc TEST_F(NCCLTester, ncclReduceOp) { @@ -242,7 +240,7 @@ TEST_F(NCCLTester, ncclReduceOp) { // // ncclBcastOp with desc TEST_F(NCCLTester, ncclBcastOp) { std::unique_ptr op2(new f::OpDescBind); - const int kRoot = 0; + const int kRoot = 5; op2->SetType("ncclBcast"); op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); @@ -266,7 +264,7 @@ TEST_F(NCCLTester, ncclBcastOp) { const int idx = 1; // check results on - float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + float result = kRoot; p::CPUPlace cpu_place; p::GPUPlace gpu_place(gpu_list[idx]); @@ -292,14 +290,14 @@ TEST_F(NCCLTester, MultipleOp) { const int kRoot = 0; std::unique_ptr op1(new f::OpDescBind); op1->SetType("ncclReduce"); - op1->SetInput("X", {"rt"}); + op1->SetInput("X", {"st"}); op1->SetInput("Communicator", {"comm"}); op1->SetOutput("Out", {"rt"}); - op2->SetAttr("root", {kRoot}); + op1->SetAttr("root", {kRoot}); std::unique_ptr op2(new f::OpDescBind); op2->SetType("ncclBcast"); - op2->SetInput("X", {"st"}); + op2->SetInput("X", {"rt"}); op2->SetInput("Communicator", {"comm"}); op2->SetOutput("Out", {"rt"}); op2->SetAttr("root", {kRoot}); From 4e165f4ea36902b5c85a42d71626d4ba5816869a Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Wed, 25 Oct 2017 17:35:33 -0700 Subject: [PATCH 37/81] "fix create output variable bug" --- paddle/operators/nccl_op.cc | 3 + paddle/operators/nccl_op.cu | 44 ++-- paddle/operators/nccl_op_test.cu | 364 ++++++++++++++++--------------- 3 files changed, 214 insertions(+), 197 deletions(-) diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 67bcc419fa..6a0589cb20 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -114,6 +114,9 @@ class NCCLBcastOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), " Output(Out) of Bcast op output should not be NULL"); + int root = ctx->Attrs().Get("root"); + PADDLE_ENFORCE(root != -1, "Bcast root must be set."); + auto x_dims = ctx->GetInputsDim("X"); ctx->SetOutputsDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index 9b9e1df258..1eef2f218f 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -54,12 +54,12 @@ class NCCLAllReduceKernel : public framework::OpKernel { ctx.device_context()) .stream(); // device id - int device_id = - boost::get(ctx.GetPlace()).GetDeviceId(); - int idx = comm->GetCommId(device_id); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(gpu_id); for (size_t i = 0; i < ins.size(); ++i) { - VLOG(1) << " invoke allreduce. send " << ins[i]->numel() << " recv " + VLOG(1) << "gpu : " + << " invoke allreduce. send " << ins[i]->numel() << " recv " << outs[i]->numel(); PADDLE_ENFORCE(platform::dynload::ncclAllReduce( @@ -68,7 +68,8 @@ class NCCLAllReduceKernel : public framework::OpKernel { comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - VLOG(1) << " finished allreduce. 
send " << ins[i]->numel() << " recv " + VLOG(1) << "gpu : " + << " finished allreduce. send " << ins[i]->numel() << " recv " << outs[i]->numel(); } } @@ -91,9 +92,8 @@ class NCCLReduceKernel : public framework::OpKernel { ctx.device_context()) .stream(); // device id - int device_id = - boost::get(ctx.GetPlace()).GetDeviceId(); - int idx = comm->GetCommId(device_id); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(gpu_id); auto ins_names = ctx.Inputs("X"); std::hash hasher; @@ -102,20 +102,20 @@ class NCCLReduceKernel : public framework::OpKernel { root = hasher(ins_names[i]) % comm->comms_.size(); } T* recvbuffer = nullptr; - if (root == device_id) { + if (root == gpu_id) { recvbuffer = outs[i]->mutable_data(ctx.GetPlace()); } - VLOG(1) << " invoke reduce. send " << ins[i]->numel() << " recv " - << outs[i]->numel(); + VLOG(1) << "gpu : " << gpu_id << " invoke reduce. send " + << ins[i]->numel() << " recv " << outs[i]->numel(); PADDLE_ENFORCE(platform::dynload::ncclReduce( ins[i]->data(), recvbuffer, ins[i]->numel(), NCCLTypeWrapper::type, ncclSum, root, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - VLOG(1) << " finished reduce. send " << ins[i]->numel() << " recv " - << outs[i]->numel(); + VLOG(1) << "gpu : " << gpu_id << " finished reduce. send " + << ins[i]->numel() << " recv " << outs[i]->numel(); } } }; @@ -135,33 +135,37 @@ class NCCLBcastKernel : public framework::OpKernel { ctx.device_context()) .stream(); // device id - int device_id = - boost::get(ctx.GetPlace()).GetDeviceId(); - int idx = comm->GetCommId(device_id); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(gpu_id); if (idx == root) { auto ins = ctx.MultiInput("X"); for (size_t i = 0; i < ins.size(); ++i) { - VLOG(1) << " invoke Bcast. send " << ins[i]->numel(); + VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. send " + << ins[i]->numel(); + VLOG(1) << " before ncclBcast"; PADDLE_ENFORCE(platform::dynload::ncclBcast( (void*)ins[i]->data(), ins[i]->numel(), NCCLTypeWrapper::type, root, comm->comms_[idx], stream)); + VLOG(1) << " after ncclBcast"; PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - VLOG(1) << " finished Bcast."; + VLOG(1) << "gpu : " << gpu_id << " finished Bcast."; } } else { auto outs = ctx.MultiOutput("Out"); for (size_t i = 0; i < outs.size(); ++i) { - VLOG(1) << " invoke Bcast. recv. "; + VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " + << framework::product(outs[i]->dims()); PADDLE_ENFORCE(platform::dynload::ncclBcast( outs[i]->mutable_data(ctx.GetPlace()), outs[i]->numel(), NCCLTypeWrapper::type, root, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - VLOG(1) << " finished Bcast. recv " << outs[i]->numel(); + VLOG(1) << "gpu : " << gpu_id << " finished Bcast. 
recv " + << outs[i]->numel(); } } } diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index d785b279d6..1132c3d43d 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -87,30 +87,34 @@ class NCCLTester : public ::testing::Test { void PerThreadProgram(int gpu_id, const f::OpDescBind &op_desc, f::Scope *scope) { std::unique_lock lk(mu); - f::ProgramDescBind program; - f::BlockDescBind *block = program.Block(0); - f::OpDescBind *op1 = block->AppendOp(); - *op1 = op_desc; + const f::OpDescBind *op1 = &op_desc; p::GPUPlace place(gpu_id); auto &ctx = dev_ctxs.at(gpu_id); auto *send_tensor = scope->Var("st")->GetMutable(); auto *recv_tensor = scope->Var("rt")->GetMutable(); - send_tensor->Resize(kDims); - send_tensor->mutable_data(kDims, place); - std::vector send_vector(f::product(kDims), gpu_id); - send_tensor->CopyFromVector(send_vector, *ctx); + if (!send_tensor->numel()) { + send_tensor->Resize(kDims); + send_tensor->mutable_data(kDims, place); + + std::vector send_vector(f::product(kDims), gpu_id); + send_tensor->CopyFromVector(send_vector, *ctx); + ctx->Wait(); + VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); + } + lk.unlock(); + PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims), "Tensor numel not match!"); - ctx->Wait(); - - VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); auto op = f::OpRegistry::CreateOp(*op1); + VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); + VLOG(1) << " send_tensor : " << send_tensor->numel() + << " recv_tensor : " << recv_tensor->numel(); op->Run(*scope, *ctx); VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); } @@ -122,168 +126,171 @@ class NCCLTester : public ::testing::Test { std::mutex mu; }; -// ncclInitOp with desc -TEST(NCCL, ncclInitOp) { - std::unique_ptr op_desc(new f::OpDescBind); - - op_desc->SetType("ncclInit"); - op_desc->SetOutput("Communicator", {"x1"}); - op_desc->SetAttr("gpus", {gpu_list}); - - f::Scope g_scope; - std::unique_ptr ctx(new p::CPUDeviceContext(p::CPUPlace())); - - auto *var = g_scope.Var("x1"); - var->GetMutable(); - - auto op = f::OpRegistry::CreateOp(*op_desc); - VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, *ctx.get()); - VLOG(1) << "NCCLInitOp finished."; -} - -// ncclAllReduceOp with desc -TEST_F(NCCLTester, ncclAllReduceOp) { - std::unique_ptr op2(new f::OpDescBind); - op2->SetType("ncclAllReduce"); - op2->SetInput("X", {"st"}); - op2->SetInput("Communicator", {"comm"}); - op2->SetOutput("Out", {"rt"}); - - std::vector dev_scopes; - - std::vector ths; - - for (size_t i = 0; i < gpu_list.size(); ++i) { - dev_scopes.emplace_back(&g_scope.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], - *op2.get(), dev_scopes[i]); - ths.emplace_back(std::move(th)); - } - - for (size_t i = 0; i < gpu_list.size(); ++i) { - ths[i].join(); - } - - // check results - float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); - - for (size_t i = 0; i < dev_scopes.size(); ++i) { - p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[i]); - - auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); - auto *rt = recv_tensor.data(); - auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable(); - result_tensor->Resize(kDims); - auto *ct = result_tensor->mutable_data(cpu_place); - - paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, - recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs[i])->stream()); - - for (size_t j = 0; 
j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], result, 1e-5); - } - } -} - -// ncclAReduceOp with desc -TEST_F(NCCLTester, ncclReduceOp) { - std::unique_ptr op2(new f::OpDescBind); - const int kRoot = 0; - op2->SetType("ncclReduce"); - op2->SetInput("X", {"st"}); - op2->SetInput("Communicator", {"comm"}); - op2->SetOutput("Out", {"rt"}); - op2->SetAttr("root", {kRoot}); - - std::vector dev_scopes; - - std::vector ths; - - for (size_t i = 0; i < gpu_list.size(); ++i) { - dev_scopes.emplace_back(&g_scope.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], - *op2.get(), dev_scopes[i]); - ths.emplace_back(std::move(th)); - } - - for (size_t i = 0; i < gpu_list.size(); ++i) { - ths[i].join(); - } - - // check results on - float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); - - p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[kRoot]); - - auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); - auto *rt = recv_tensor.data(); - auto *result_tensor = - dev_scopes[kRoot]->Var("ct")->GetMutable(); - result_tensor->Resize(kDims); - auto *ct = result_tensor->mutable_data(cpu_place); - - paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt, - recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs[kRoot])->stream()); - - for (int j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], result, 1e-5); - } -} - -// // ncclBcastOp with desc -TEST_F(NCCLTester, ncclBcastOp) { - std::unique_ptr op2(new f::OpDescBind); - const int kRoot = 5; - op2->SetType("ncclBcast"); - op2->SetInput("X", {"st"}); - op2->SetInput("Communicator", {"comm"}); - op2->SetOutput("Out", {"rt"}); - op2->SetAttr("root", {kRoot}); - - std::vector dev_scopes; - - std::vector ths; - - for (size_t i = 0; i < gpu_list.size(); ++i) { - dev_scopes.emplace_back(&g_scope.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], - *op2.get(), dev_scopes[i]); - ths.emplace_back(std::move(th)); - } - - for (size_t i = 0; i < gpu_list.size(); ++i) { - ths[i].join(); - } - - const int idx = 1; - // check results on - float result = kRoot; - - p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[idx]); - - auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); - auto *rt = recv_tensor.data(); - auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable(); - result_tensor->Resize(kDims); - auto *ct = result_tensor->mutable_data(cpu_place); - - paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt, - recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs[idx])->stream()); - - for (size_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], result, 1e-5); - } -} +// // ncclInitOp with desc +// TEST(NCCL, ncclInitOp) { +// std::unique_ptr op_desc(new f::OpDescBind); + +// op_desc->SetType("ncclInit"); +// op_desc->SetOutput("Communicator", {"x1"}); +// op_desc->SetAttr("gpus", {gpu_list}); + +// f::Scope g_scope; +// std::unique_ptr ctx(new +// p::CPUDeviceContext(p::CPUPlace())); + +// auto *var = g_scope.Var("x1"); +// var->GetMutable(); + +// auto op = f::OpRegistry::CreateOp(*op_desc); +// VLOG(1) << "invoke NCCLInitOp."; +// op->Run(g_scope, *ctx.get()); +// VLOG(1) << "NCCLInitOp finished."; +// } + +// // ncclAllReduceOp with desc +// TEST_F(NCCLTester, ncclAllReduceOp) { +// std::unique_ptr op2(new f::OpDescBind); +// op2->SetType("ncclAllReduce"); +// op2->SetInput("X", {"st"}); +// op2->SetInput("Communicator", {"comm"}); +// op2->SetOutput("Out", {"rt"}); + +// std::vector dev_scopes; + +// 
std::vector ths; + +// for (size_t i = 0; i < gpu_list.size(); ++i) { +// dev_scopes.emplace_back(&g_scope.NewScope()); +// std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], +// *op2.get(), dev_scopes[i]); +// ths.emplace_back(std::move(th)); +// } + +// for (size_t i = 0; i < gpu_list.size(); ++i) { +// ths[i].join(); +// } + +// // check results +// float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + +// for (size_t i = 0; i < dev_scopes.size(); ++i) { +// p::CPUPlace cpu_place; +// p::GPUPlace gpu_place(gpu_list[i]); + +// auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); +// auto *rt = recv_tensor.data(); +// auto *result_tensor = +// dev_scopes[i]->Var("ct")->GetMutable(); +// result_tensor->Resize(kDims); +// auto *ct = result_tensor->mutable_data(cpu_place); + +// paddle::memory::Copy( +// cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, +// recv_tensor.numel() * sizeof(float), +// static_cast(dev_ctxs[i])->stream()); + +// for (size_t j = 0; j < f::product(kDims); ++j) { +// ASSERT_NEAR(ct[j], result, 1e-5); +// } +// } +// } + +// // ncclAReduceOp with desc +// TEST_F(NCCLTester, ncclReduceOp) { +// std::unique_ptr op2(new f::OpDescBind); +// const int kRoot = 0; +// op2->SetType("ncclReduce"); +// op2->SetInput("X", {"st"}); +// op2->SetInput("Communicator", {"comm"}); +// op2->SetOutput("Out", {"rt"}); +// op2->SetAttr("root", {kRoot}); + +// std::vector dev_scopes; + +// std::vector ths; + +// for (size_t i = 0; i < gpu_list.size(); ++i) { +// dev_scopes.emplace_back(&g_scope.NewScope()); +// std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], +// *op2.get(), dev_scopes[i]); +// ths.emplace_back(std::move(th)); +// } + +// for (size_t i = 0; i < gpu_list.size(); ++i) { +// ths[i].join(); +// } + +// // check results on +// float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + +// p::CPUPlace cpu_place; +// p::GPUPlace gpu_place(gpu_list[kRoot]); + +// auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); +// auto *rt = recv_tensor.data(); +// auto *result_tensor = +// dev_scopes[kRoot]->Var("ct")->GetMutable(); +// result_tensor->Resize(kDims); +// auto *ct = result_tensor->mutable_data(cpu_place); + +// paddle::memory::Copy( +// cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt, +// recv_tensor.numel() * sizeof(float), +// static_cast(dev_ctxs[kRoot])->stream()); + +// for (int j = 0; j < f::product(kDims); ++j) { +// ASSERT_NEAR(ct[j], result, 1e-5); +// } +// } + +// // // ncclBcastOp with desc +// TEST_F(NCCLTester, ncclBcastOp) { +// std::unique_ptr op2(new f::OpDescBind); +// const int kRoot = 5; +// op2->SetType("ncclBcast"); +// op2->SetInput("X", {"st"}); +// op2->SetInput("Communicator", {"comm"}); +// op2->SetOutput("Out", {"rt"}); +// op2->SetAttr("root", {kRoot}); + +// std::vector dev_scopes; + +// std::vector ths; + +// for (size_t i = 0; i < gpu_list.size(); ++i) { +// dev_scopes.emplace_back(&g_scope.NewScope()); +// std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], +// *op2.get(), dev_scopes[i]); +// ths.emplace_back(std::move(th)); +// } + +// for (size_t i = 0; i < gpu_list.size(); ++i) { +// ths[i].join(); +// } + +// const int idx = 1; +// // check results on +// float result = kRoot; + +// p::CPUPlace cpu_place; +// p::GPUPlace gpu_place(gpu_list[idx]); + +// auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); +// auto *rt = recv_tensor.data(); +// auto *result_tensor = +// dev_scopes[idx]->Var("ct")->GetMutable(); +// result_tensor->Resize(kDims); +// auto *ct = 
result_tensor->mutable_data(cpu_place); + +// paddle::memory::Copy( +// cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt, +// recv_tensor.numel() * sizeof(float), +// static_cast(dev_ctxs[idx])->stream()); + +// for (size_t j = 0; j < f::product(kDims); ++j) { +// ASSERT_NEAR(ct[j], result, 1e-5); +// } +// } // joint ncclBcastOp and ncclReduceOp TEST_F(NCCLTester, MultipleOp) { @@ -299,14 +306,17 @@ TEST_F(NCCLTester, MultipleOp) { op2->SetType("ncclBcast"); op2->SetInput("X", {"rt"}); op2->SetInput("Communicator", {"comm"}); - op2->SetOutput("Out", {"rt"}); + op2->SetOutput("Out", {"out"}); op2->SetAttr("root", {kRoot}); std::vector dev_scopes; + // for (size_t i = 0; i < dev_scopes.size(); ++i) { + // dev_scopes[i]->Var("out")->GetMutable(); + // } std::vector ths; - // run Bcast + // run Reduce for (size_t i = 0; i < gpu_list.size(); ++i) { dev_scopes.emplace_back(&g_scope.NewScope()); std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], @@ -320,9 +330,9 @@ TEST_F(NCCLTester, MultipleOp) { ths.clear(); - // run Reduce + // run Bcast for (size_t i = 0; i < gpu_list.size(); ++i) { - dev_scopes.emplace_back(&g_scope.NewScope()); + dev_scopes[i]->Var("out")->GetMutable(); std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], *op2.get(), dev_scopes[i]); ths.emplace_back(std::move(th)); From 2573ac1448944df17f055b18d1c21519fe07d5ef Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Wed, 25 Oct 2017 17:57:11 -0700 Subject: [PATCH 38/81] "remove python side test case to another PR." --- paddle/operators/nccl_op_test.cu | 319 +++++++----------- .../framework/tests/test_nccl_allreduce_op.py | 97 ------ .../v2/framework/tests/test_nccl_reduce_op.py | 25 -- 3 files changed, 121 insertions(+), 320 deletions(-) delete mode 100644 python/paddle/v2/framework/tests/test_nccl_allreduce_op.py delete mode 100644 python/paddle/v2/framework/tests/test_nccl_reduce_op.py diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index 1132c3d43d..63a286f602 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -126,213 +126,40 @@ class NCCLTester : public ::testing::Test { std::mutex mu; }; -// // ncclInitOp with desc -// TEST(NCCL, ncclInitOp) { -// std::unique_ptr op_desc(new f::OpDescBind); - -// op_desc->SetType("ncclInit"); -// op_desc->SetOutput("Communicator", {"x1"}); -// op_desc->SetAttr("gpus", {gpu_list}); - -// f::Scope g_scope; -// std::unique_ptr ctx(new -// p::CPUDeviceContext(p::CPUPlace())); - -// auto *var = g_scope.Var("x1"); -// var->GetMutable(); - -// auto op = f::OpRegistry::CreateOp(*op_desc); -// VLOG(1) << "invoke NCCLInitOp."; -// op->Run(g_scope, *ctx.get()); -// VLOG(1) << "NCCLInitOp finished."; -// } - -// // ncclAllReduceOp with desc -// TEST_F(NCCLTester, ncclAllReduceOp) { -// std::unique_ptr op2(new f::OpDescBind); -// op2->SetType("ncclAllReduce"); -// op2->SetInput("X", {"st"}); -// op2->SetInput("Communicator", {"comm"}); -// op2->SetOutput("Out", {"rt"}); - -// std::vector dev_scopes; - -// std::vector ths; - -// for (size_t i = 0; i < gpu_list.size(); ++i) { -// dev_scopes.emplace_back(&g_scope.NewScope()); -// std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], -// *op2.get(), dev_scopes[i]); -// ths.emplace_back(std::move(th)); -// } - -// for (size_t i = 0; i < gpu_list.size(); ++i) { -// ths[i].join(); -// } - -// // check results -// float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); - -// for (size_t i = 0; i < dev_scopes.size(); ++i) { -// p::CPUPlace 
cpu_place; -// p::GPUPlace gpu_place(gpu_list[i]); - -// auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); -// auto *rt = recv_tensor.data(); -// auto *result_tensor = -// dev_scopes[i]->Var("ct")->GetMutable(); -// result_tensor->Resize(kDims); -// auto *ct = result_tensor->mutable_data(cpu_place); - -// paddle::memory::Copy( -// cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, -// recv_tensor.numel() * sizeof(float), -// static_cast(dev_ctxs[i])->stream()); - -// for (size_t j = 0; j < f::product(kDims); ++j) { -// ASSERT_NEAR(ct[j], result, 1e-5); -// } -// } -// } - -// // ncclAReduceOp with desc -// TEST_F(NCCLTester, ncclReduceOp) { -// std::unique_ptr op2(new f::OpDescBind); -// const int kRoot = 0; -// op2->SetType("ncclReduce"); -// op2->SetInput("X", {"st"}); -// op2->SetInput("Communicator", {"comm"}); -// op2->SetOutput("Out", {"rt"}); -// op2->SetAttr("root", {kRoot}); - -// std::vector dev_scopes; - -// std::vector ths; - -// for (size_t i = 0; i < gpu_list.size(); ++i) { -// dev_scopes.emplace_back(&g_scope.NewScope()); -// std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], -// *op2.get(), dev_scopes[i]); -// ths.emplace_back(std::move(th)); -// } - -// for (size_t i = 0; i < gpu_list.size(); ++i) { -// ths[i].join(); -// } - -// // check results on -// float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); - -// p::CPUPlace cpu_place; -// p::GPUPlace gpu_place(gpu_list[kRoot]); - -// auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); -// auto *rt = recv_tensor.data(); -// auto *result_tensor = -// dev_scopes[kRoot]->Var("ct")->GetMutable(); -// result_tensor->Resize(kDims); -// auto *ct = result_tensor->mutable_data(cpu_place); - -// paddle::memory::Copy( -// cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt, -// recv_tensor.numel() * sizeof(float), -// static_cast(dev_ctxs[kRoot])->stream()); - -// for (int j = 0; j < f::product(kDims); ++j) { -// ASSERT_NEAR(ct[j], result, 1e-5); -// } -// } - -// // // ncclBcastOp with desc -// TEST_F(NCCLTester, ncclBcastOp) { -// std::unique_ptr op2(new f::OpDescBind); -// const int kRoot = 5; -// op2->SetType("ncclBcast"); -// op2->SetInput("X", {"st"}); -// op2->SetInput("Communicator", {"comm"}); -// op2->SetOutput("Out", {"rt"}); -// op2->SetAttr("root", {kRoot}); - -// std::vector dev_scopes; - -// std::vector ths; - -// for (size_t i = 0; i < gpu_list.size(); ++i) { -// dev_scopes.emplace_back(&g_scope.NewScope()); -// std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], -// *op2.get(), dev_scopes[i]); -// ths.emplace_back(std::move(th)); -// } - -// for (size_t i = 0; i < gpu_list.size(); ++i) { -// ths[i].join(); -// } - -// const int idx = 1; -// // check results on -// float result = kRoot; - -// p::CPUPlace cpu_place; -// p::GPUPlace gpu_place(gpu_list[idx]); - -// auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); -// auto *rt = recv_tensor.data(); -// auto *result_tensor = -// dev_scopes[idx]->Var("ct")->GetMutable(); -// result_tensor->Resize(kDims); -// auto *ct = result_tensor->mutable_data(cpu_place); - -// paddle::memory::Copy( -// cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt, -// recv_tensor.numel() * sizeof(float), -// static_cast(dev_ctxs[idx])->stream()); - -// for (size_t j = 0; j < f::product(kDims); ++j) { -// ASSERT_NEAR(ct[j], result, 1e-5); -// } -// } - -// joint ncclBcastOp and ncclReduceOp -TEST_F(NCCLTester, MultipleOp) { - const int kRoot = 0; - std::unique_ptr op1(new f::OpDescBind); - op1->SetType("ncclReduce"); - op1->SetInput("X", {"st"}); 
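  // [editorial sketch, not part of the patch] The MultipleOp test deleted in
  // this hunk chained ncclReduce into ncclBcast, which together behave like a
  // single allreduce. Against the raw NCCL API the same pipeline is roughly
  // (ngpus, count, sendbuf, recvbuf and streams are illustrative names; this
  // uses NCCL 1.x single-thread-multi-GPU semantics, NCCL 2 would bracket the
  // loop with ncclGroupStart()/ncclGroupEnd()):
  //
  //   std::vector<ncclComm_t> comms(ngpus);
  //   ncclCommInitAll(comms.data(), ngpus, nullptr);  // devices 0..ngpus-1
  //   for (int i = 0; i < ngpus; ++i) {
  //     cudaSetDevice(i);
  //     ncclReduce(sendbuf[i], recvbuf[i], count, ncclFloat, ncclSum,
  //                /*root=*/0, comms[i], streams[i]);
  //     ncclBcast(recvbuf[i], count, ncclFloat, /*root=*/0, comms[i],
  //               streams[i]);
  //   }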
- op1->SetInput("Communicator", {"comm"}); - op1->SetOutput("Out", {"rt"}); - op1->SetAttr("root", {kRoot}); +// ncclInitOp with desc +TEST(NCCL, ncclInitOp) { + std::unique_ptr op_desc(new f::OpDescBind); + + op_desc->SetType("ncclInit"); + op_desc->SetOutput("Communicator", {"x1"}); + op_desc->SetAttr("gpus", {gpu_list}); + + f::Scope g_scope; + std::unique_ptr ctx(new p::CPUDeviceContext(p::CPUPlace())); + + auto *var = g_scope.Var("x1"); + var->GetMutable(); + auto op = f::OpRegistry::CreateOp(*op_desc); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, *ctx.get()); + VLOG(1) << "NCCLInitOp finished."; +} + +// ncclAllReduceOp with desc +TEST_F(NCCLTester, ncclAllReduceOp) { std::unique_ptr op2(new f::OpDescBind); - op2->SetType("ncclBcast"); - op2->SetInput("X", {"rt"}); + op2->SetType("ncclAllReduce"); + op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); - op2->SetOutput("Out", {"out"}); - op2->SetAttr("root", {kRoot}); + op2->SetOutput("Out", {"rt"}); std::vector dev_scopes; - // for (size_t i = 0; i < dev_scopes.size(); ++i) { - // dev_scopes[i]->Var("out")->GetMutable(); - // } std::vector ths; - // run Reduce for (size_t i = 0; i < gpu_list.size(); ++i) { dev_scopes.emplace_back(&g_scope.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], - *op1.get(), dev_scopes[i]); - ths.emplace_back(std::move(th)); - } - - for (size_t i = 0; i < gpu_list.size(); ++i) { - ths[i].join(); - } - - ths.clear(); - - // run Bcast - for (size_t i = 0; i < gpu_list.size(); ++i) { - dev_scopes[i]->Var("out")->GetMutable(); std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], *op2.get(), dev_scopes[i]); ths.emplace_back(std::move(th)); @@ -360,12 +187,108 @@ TEST_F(NCCLTester, MultipleOp) { recv_tensor.numel() * sizeof(float), static_cast(dev_ctxs[i])->stream()); - for (int j = 0; j < f::product(kDims); ++j) { + for (size_t j = 0; j < f::product(kDims); ++j) { ASSERT_NEAR(ct[j], result, 1e-5); } } } +// ncclAReduceOp with desc +TEST_F(NCCLTester, ncclReduceOp) { + std::unique_ptr op2(new f::OpDescBind); + const int kRoot = 0; + op2->SetType("ncclReduce"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + op2->SetAttr("root", {kRoot}); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + + // check results on + float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + + p::CPUPlace cpu_place; + p::GPUPlace gpu_place(gpu_list[kRoot]); + + auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = + dev_scopes[kRoot]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[kRoot])->stream()); + + for (int j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } +} + +// // ncclBcastOp with desc +TEST_F(NCCLTester, ncclBcastOp) { + std::unique_ptr op2(new f::OpDescBind); + const int kRoot = 5; + op2->SetType("ncclBcast"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + 
op2->SetOutput("Out", {"rt"}); + op2->SetAttr("root", {kRoot}); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + + const int idx = 1; + // check results on + float result = kRoot; + + p::CPUPlace cpu_place; + p::GPUPlace gpu_place(gpu_list[idx]); + + auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[idx])->stream()); + + for (size_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } +} + int main(int argc, char **argv) { const int dev_count = p::GetCUDADeviceCount(); if (dev_count <= 1) { diff --git a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py deleted file mode 100644 index 0a9163dd55..0000000000 --- a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py +++ /dev/null @@ -1,97 +0,0 @@ -import unittest, os -from threading import Thread -import numpy as np -import paddle.v2 as paddle -from paddle.v2.framework.op import Operator -import paddle.v2.framework.core as core -from op_test import OpTest, create_op, set_input - -# gpu_list = os.environ["NV_LIST"] -gpu_list = "0,1,2,3" - -if not core.is_compile_gpu() or not gpu_list: - exit(0) - -g_scope = core.Scope() -g_ctx = core.DeviceContext.create(core.CPUPlace()) -gpus = [int(g) for g in gpu_list.split(",")] - - -# ground truth -def allreduce(tensors, gpus): - num_device = len(gpus) - assert (len(tensors) == num_device), "not match of tensor and device" - Out = tensors - for i in range(1, len(tensors)): - Out[0] += Out[i] - - for i in range(1, len(tensors)): - Out[i] = Out[0] - - return Out - - -input_data = [ - np.random.random((32, 32)).astype("float32") for i in range(len(gpus)) -] -output_data = allreduce(input_data, gpus) - - -def thread_allreduce_op(thread_id, gpu_id): - i = gpu_id - scope = g_scope.new_scope() - place = core.GPUPlace(gpus[i]) - inputs = { - "X": input_data[i], - "Communicator": scope.find_var("Communicator") - } - outputs = {"Out": output_data[i]} - - op = create_op(scope, "ncclAllReduce", inputs, outputs, attrs={}) - place = core.GPUPlace(gpus[i]) - set_input(scope, op, inputs, place) - - ctx = core.DeviceContext.create(place) - - print "thread_id : ", thread_id, "gpu_id : ", gpu_id, " invoke allreduce" - op.run(scope, ctx) - print "thread_id : ", thread_id, "gpu_id : ", gpu_id, " allreduce Done." 
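# [editorial note] The allreduce() helper in this deleted file is the
# reduce-then-broadcast formulation of allreduce: after its two loops, every
# entry of Out holds the elementwise sum of all inputs. A quick illustrative
# check (assumes numpy as np; not part of the original test):
#
#   a = [np.full((32, 32), g, dtype="float32") for g in gpus]
#   out = allreduce(a, gpus)
#   assert all(np.allclose(o, sum(gpus)) for o in out)
#
# This matches the expected value the C++ tests compute with
# std::accumulate(gpu_list.begin(), gpu_list.end(), 0).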
- - -class TestNCCLAllReduce(unittest.TestCase): - def setUp(self): - self.op_type = "ncclAllReduce" - - nccl_init = create_op( - g_scope, - op_type="ncclInit", - inputs={}, - outputs={ - "Communicator": g_scope.var("Communicator").get_communicator() - }, - attrs={"gpus": gpus}) - nccl_init.run(g_scope, g_ctx) - - def test_output(self): - ops = [] - for i in range(len(gpus)): - th = Thread( - target=thread_allreduce_op, args=( - i, - gpus[i], )) - th.start() - ops.append(th) - for t in ops: - t.join() - - idx = 0 - for out_name, out_dup in Operator.get_op_outputs(self.op_type): - actual = np.array(g_scope.find_var(out_name).get_tensor()) - expect = output_data[idx] - - idx += 1 - self.assertTrue(actual, expect), "has diff" - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/v2/framework/tests/test_nccl_reduce_op.py b/python/paddle/v2/framework/tests/test_nccl_reduce_op.py deleted file mode 100644 index 0cee1923a6..0000000000 --- a/python/paddle/v2/framework/tests/test_nccl_reduce_op.py +++ /dev/null @@ -1,25 +0,0 @@ -import unittest, os -import numpy as np -import paddle.v2 as paddle -from paddle.v2.framework.op import Operator -import paddle.v2.framework.core as core -from op_test import OpTest, create_op, set_input - -gpu_list = "0,1,2,3" -g_scope = core.Scope() -g_ctx = core.DeviceContext.create(core.CPUPlace()) - -if not core.is_compile_gpu() or not gpu_list: - exit(0) - - -class TestNCCLReduce(OpTest): - def setUp(self): - self.op_type = "ncclReduce" - self.gpus = [int(g) for g in gpu_list.split(",")] - - self.scope = g_scope.var("Communicator").get_communicator() - self.outputs = {"Communicator": self.scope.var("Communicator")} - - def test_check_output(self): - self.check_output() From 626ff3b79e60a8e221f647ddf3450173a2e8613f Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Wed, 25 Oct 2017 19:18:02 -0700 Subject: [PATCH 39/81] "polish cmake file" --- paddle/operators/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 8b393961fd..7ddceb70d1 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -88,7 +88,6 @@ function(op_library TARGET) set(pybind_flag 1) # It's enough to just adding one operator to pybind file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n") - file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(ncclInit);\n") endif() # reduce_op contains several operators From 4b9cf0e8b116e28f20f46c407f7d3f675eca1424 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Wed, 25 Oct 2017 20:23:13 -0700 Subject: [PATCH 40/81] "add disable" --- paddle/operators/nccl/nccl_gpu_common.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h index fe49d19a9d..eead7f79b7 100644 --- a/paddle/operators/nccl/nccl_gpu_common.h +++ b/paddle/operators/nccl/nccl_gpu_common.h @@ -25,6 +25,7 @@ #include "paddle/platform/device_context.h" #include "paddle/platform/dynload/nccl.h" #include "paddle/platform/enforce.h" +#include "paddle/platform/macros.h" namespace paddle { namespace platform { @@ -51,7 +52,7 @@ struct Communicator { } } - // DISABLE_COPY_AND_ASSIGN(Communicator); + DISABLE_COPY_AND_ASSIGN(Communicator); }; } // namespace platform From db1bb8224aa78a166e04c690a007ca9fa4746d9d Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 26 Oct 2017 20:59:17 +0800 Subject: [PATCH 41/81] follow comments --- paddle/operators/math/context_project.h | 9 +++---- 
paddle/operators/sequence_conv_op.cc | 26 +++++++++---------- paddle/operators/sequence_conv_op.h | 16 ++++++------ .../v2/framework/tests/test_seq_conv.py | 8 +++--- 4 files changed, 28 insertions(+), 31 deletions(-) diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index e37f3a5bf2..b7466d206e 100644 --- a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -34,18 +34,15 @@ using EigenMatrix = framework::EigenMatrix; * \param in Input data. * \param Shape The shape of Input data, - * [minibatch, number_of_input_features]. - * \param type A float LoDTensor. + * [minibatch, input_hidden_size]. * * \param padding_data Padding data. * \param Shape The shape of Padding data, - * [up_pad + down_pad, number_of_input_features]. - * \param type A float Tensor. + * [up_pad + down_pad, input_hidden_size]. * * \param col Col data. * \param Shape The shape of Col data, - * [minibatch, context_length * number_of_input_features]. - * \param type A float Tensor. + * [minibatch, context_length * input_hidden_size]. * * For a mini-batch of 2 variable lengths sentences, containing 3, and 1 * time-steps: diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc index 139000c561..a73ceb4157 100644 --- a/paddle/operators/sequence_conv_op.cc +++ b/paddle/operators/sequence_conv_op.cc @@ -30,9 +30,9 @@ class SequenceConvOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of SequenceConvOp should not be null."); - int context_length = ctx->Attrs().Get("context_length"); - bool padding_trainable = ctx->Attrs().Get("padding_trainable"); - int context_start = ctx->Attrs().Get("context_start"); + int context_length = ctx->Attrs().Get("contextLength"); + bool padding_trainable = ctx->Attrs().Get("paddingTrainable"); + int context_start = ctx->Attrs().Get("contextStart"); auto in_dims = ctx->GetInputDim("X"); auto filter_dims = ctx->GetInputDim("Filter"); @@ -54,7 +54,7 @@ class SequenceConvOp : public framework::OperatorWithKernel { if (context_start == 0 && context_length == 1) { PADDLE_THROW( - "If context_start is 0 and context_length is 1, padding_trainable " + "If context_start is 0 and context_length is 1, paddingTrainable " "should be false."); } PADDLE_ENFORCE(padding_dim.size() == 2, @@ -81,7 +81,7 @@ class SequenceConvGradOp : public framework::OperatorWithKernel { "Gradient of output(Out) should not be null."); PADDLE_ENFORCE(ctx->HasInput("X"), "The input(X) should not be null."); - if (ctx->Attrs().Get("padding_trainable") && + if (ctx->Attrs().Get("paddingTrainable") && ctx->HasOutput(framework::GradVarName("PaddingData"))) { ctx->SetOutputDim(framework::GradVarName("PaddingData"), ctx->GetInputDim("PaddingData")); @@ -128,25 +128,25 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { "this LoDTensor is a matrix with shape (T, D), where, T is the " "total time steps in this mini-batch, D is the output feature size."); - AddAttr("padding_trainable", + AddAttr("paddingTrainable", "(bool, default false) the padding data of SequenceConvOp " "is trainable or not.") .SetDefault(false); - AddAttr("context_length", - "(int, default 3) the context_length of SequenceConvOp is the " + AddAttr("contextLength", + "(int, default 3) the contextLength of SequenceConvOp is the " "height of the convolution kernel.") .SetDefault(3) .GreaterThan(0); - AddAttr("context_start", - "(int, default 0) the context_start of SequenceConvOp " + 
AddAttr("contextStart", + "(int, default 0) the contextStart of SequenceConvOp " "represents the beginning of the convolution of the number of " "rows of sequence, which can be negative.") .SetDefault(0); - AddAttr("context_stride", - "(int, default 1) the context_stride of SequenceConvOp " + AddAttr("contextStride", + "(int, default 1) the contextStride of SequenceConvOp " "represents the step length of convolution. " "Currently, SequenceConvOp only supports" - "context_stride=1.") + "contextStride=1.") .SetDefault(1) .GreaterThan(0); diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h index cd8a8d4cea..c502601b38 100644 --- a/paddle/operators/sequence_conv_op.h +++ b/paddle/operators/sequence_conv_op.h @@ -35,10 +35,10 @@ class SequenceConvKernel : public framework::OpKernel { out->mutable_data(context.GetPlace()); context.ShareLoD("X", "Out"); - int context_start = context.Attr("context_start"); - int context_length = context.Attr("context_length"); - int context_stride = context.Attr("context_stride"); - bool padding_trainable = context.Attr("padding_trainable"); + int context_start = context.Attr("contextStart"); + int context_length = context.Attr("contextLength"); + int context_stride = context.Attr("contextStride"); + bool padding_trainable = context.Attr("paddingTrainable"); // InferShape by in_lod PADDLE_ENFORCE_EQ(in->lod().size(), 1UL, @@ -89,10 +89,10 @@ class SequenceConvGradKernel : public framework::OpKernel { auto* in = context.Input("X"); auto* filter = context.Input("Filter"); - int context_start = context.Attr("context_start"); - int context_length = context.Attr("context_length"); - int context_stride = context.Attr("context_stride"); - bool padding_trainable = context.Attr("padding_trainable"); + int context_start = context.Attr("contextStart"); + int context_length = context.Attr("contextLength"); + int context_stride = context.Attr("contextStride"); + bool padding_trainable = context.Attr("paddingTrainable"); PADDLE_ENFORCE_EQ(in->lod().size(), 1UL, "Only support one level sequence now."); diff --git a/python/paddle/v2/framework/tests/test_seq_conv.py b/python/paddle/v2/framework/tests/test_seq_conv.py index f0337c20a9..14edc5f953 100644 --- a/python/paddle/v2/framework/tests/test_seq_conv.py +++ b/python/paddle/v2/framework/tests/test_seq_conv.py @@ -45,10 +45,10 @@ class TestSeqProject(OpTest): self.inputs_val_no_f = ['PaddingData', 'X'] self.attrs = { - 'context_start': self.context_start, - 'context_length': self.context_length, - 'padding_trainable': self.padding_trainable, - 'context_stride': self.context_stride + 'contextStart': self.context_start, + 'contextLength': self.context_length, + 'paddingTrainable': self.padding_trainable, + 'contextStride': self.context_stride } out = np.zeros( (self.input_size[0], self.output_represention)).astype('float32') From 6bc261b9330b1bb810e970e20cdce56b3d40f492 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Thu, 26 Oct 2017 09:15:15 -0700 Subject: [PATCH 42/81] fix ci --- paddle/operators/nccl/nccl_gpu_common.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h index eead7f79b7..0d71eddf02 100644 --- a/paddle/operators/nccl/nccl_gpu_common.h +++ b/paddle/operators/nccl/nccl_gpu_common.h @@ -34,6 +34,8 @@ struct Communicator { std::vector comms_; std::unordered_map comm_id_map_; + Communicator() {} + int GetCommId(int device_id) const { return comm_id_map_.at(device_id); } void InitAll(const std::vector& 
gpus) {

From dbfd1302e1486939b33b79b2485b0889f5cc2994 Mon Sep 17 00:00:00 2001
From: Dong Zhihong
Date: Thu, 26 Oct 2017 10:57:52 -0700
Subject: [PATCH 43/81] "FIX CI"

---
 paddle/pybind/pybind.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index e1e382b2bb..9288468a03 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -23,7 +23,6 @@ limitations under the License. */
 #include "paddle/framework/tensor_array.h"
 #include "paddle/operators/cond_op.h"
 #include "paddle/operators/dynamic_recurrent_op.h"
-#include "paddle/operators/nccl/nccl_gpu_common.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
 #include "paddle/platform/enforce.h"
@@ -33,6 +32,10 @@ limitations under the License. */
 #include "paddle/pybind/tensor_py.h"
 #include "paddle/string/to_string.h"
 
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+#endif
+
 namespace paddle {
 namespace pybind {
 static size_t UniqueIntegerGenerator() {

From 6cce5268ed7a9096a5706230c1acdca626818bf3 Mon Sep 17 00:00:00 2001
From: Dong Zhihong
Date: Thu, 26 Oct 2017 11:31:13 -0700
Subject: [PATCH 44/81] "fixed based on comment"

---
 paddle/framework/operator.h             |  5 +++--
 paddle/operators/nccl/nccl_gpu_common.h |  2 ++
 paddle/operators/nccl_op.cc             | 26 +++++++++++++------------
 paddle/operators/nccl_op.cu             | 21 ++++++++++++++++++--
 4 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 09989c374c..3236250366 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -290,11 +290,12 @@ class ExecutionContext {
     return device_context_;
   }
 
-  //! Get a input which has multiple variables.
+  //! Get variables vector with same input name.
   const std::vector<std::string>& Inputs(const std::string& name) const {
     return op_.Inputs(name);
   }
-  //! Get an output which has multiple variables.
+
+  //! Get variables vector with same output name.
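  //! (Editorial sketch: these name vectors pair index-for-index with the
  //! MultiInput/MultiOutput results for the same name, e.g., with the
  //! tensor type assumed,
  //!   auto ins = ctx.MultiInput<framework::Tensor>("X");
  //!   auto ins_names = ctx.Inputs("X");  // ins_names[i] names ins[i]
  //! which is how the ncclReduce kernel in this series hashes a root gpu
  //! from an input's variable name.)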
const std::vector& Outputs(const std::string& name) const { return op_.Outputs(name); } diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h index 0d71eddf02..5858cd4839 100644 --- a/paddle/operators/nccl/nccl_gpu_common.h +++ b/paddle/operators/nccl/nccl_gpu_common.h @@ -30,6 +30,8 @@ namespace paddle { namespace platform { +constexpr int kInvalidGPUId = -1; + struct Communicator { std::vector comms_; std::unordered_map comm_id_map_; diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 6a0589cb20..4f3a2f2768 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -69,10 +69,10 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputsDim("X"); - // std::string reduction = ctx->Attrs().Get("reduction"); - // PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || - // reduction == "ncclMin" || reduction == "ncclMax"), - // "invalid reduction."); + std::string reduction = ctx->Attrs().Get("reduction"); + PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), + "invalid reduction."); ctx->SetOutputsDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); @@ -115,7 +115,7 @@ class NCCLBcastOp : public framework::OperatorWithKernel { " Output(Out) of Bcast op output should not be NULL"); int root = ctx->Attrs().Get("root"); - PADDLE_ENFORCE(root != -1, "Bcast root must be set."); + PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set."); auto x_dims = ctx->GetInputsDim("X"); ctx->SetOutputsDim("Out", x_dims); @@ -132,9 +132,9 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input of AllReduce op"); AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of AllReduce op"); - // AddAttr("reduction", - // "{'ncclmin', 'ncclmax', 'ncclprod', 'ncclsum'}."); - // AddAttr>("gpus", "gpu id lists"); + AddAttr("reduction", + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); AddComment(R"DOC( AllReduce the input tensors. )DOC"); @@ -151,8 +151,9 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of Reduce op"); AddAttr("root", - "root gpu of the parameter. if not set(-1). hashed by name.") - .SetDefault(-1); + "root gpu of the parameter. if not " + "set(platform::kInvalidGPUId). hashed by name.") + .SetDefault(platform::kInvalidGPUId); AddComment(R"DOC( Reduce the tensors)DOC"); } @@ -168,8 +169,9 @@ class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of Bcast"); AddAttr("root", - "root gpu of the parameter. if not set(-1). hashed by name.") - .SetDefault(-1); + "root gpu of the parameter. if not " + "set(platform::kInvalidGPUId). hashed by name.") + .SetDefault(platform::kInvalidGPUId); AddComment(R"DOC( Bcast the tensors. 
)DOC"); diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index 1eef2f218f..cc01db80ca 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -48,11 +48,28 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput("Out"); + std::string reduction = ctx.Attr("reduction"); + + ncclRedOp_t reduction_op_ = ncclSum; + + if (reduction == "ncclMin") { + reduction_op_ = ncclMin; + } else if (reduction == "ncclMax") { + reduction_op_ = ncclMax; + } else if (reduction == "ncclSum") { + reduction_op_ = ncclSum; + } else if (reduction == "ncclProd") { + reduction_op_ = ncclProd; + } else { + PADDLE_ENFORCE(false, "Invalid reduction. default ncclSum."); + } + auto* comm = ctx.Input("Communicator"); auto stream = reinterpret_cast( ctx.device_context()) .stream(); + // device id int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); @@ -64,7 +81,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { PADDLE_ENFORCE(platform::dynload::ncclAllReduce( ins[i]->data(), outs[i]->mutable_data(ctx.GetPlace()), - outs[i]->numel(), NCCLTypeWrapper::type, ncclSum, + outs[i]->numel(), NCCLTypeWrapper::type, reduction_op_, comm->comms_[idx], stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); @@ -98,7 +115,7 @@ class NCCLReduceKernel : public framework::OpKernel { auto ins_names = ctx.Inputs("X"); std::hash hasher; for (size_t i = 0; i < ins.size(); ++i) { - if (root == -1) { + if (root == platform::kInvalidGPUId) { root = hasher(ins_names[i]) % comm->comms_.size(); } T* recvbuffer = nullptr; From 52200523d61ca4b77a37d2a3d53312bca52c5cb1 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Thu, 26 Oct 2017 11:39:09 -0700 Subject: [PATCH 45/81] "polish code based on comment" --- paddle/operators/nccl_op.cc | 8 ++++++++ paddle/operators/nccl_op.cu | 21 ++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 4f3a2f2768..3744d1b470 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -94,6 +94,11 @@ class NCCLReduceOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), " Input(X) of Reduce op input should not be NULL"); + std::string reduction = ctx->Attrs().Get("reduction"); + PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), + "invalid reduction."); + auto x_dims = ctx->GetInputsDim("X"); ctx->SetOutputsDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); @@ -150,6 +155,9 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input of Reduce op"); AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of Reduce op"); + AddAttr("reduction", + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); AddAttr("root", "root gpu of the parameter. if not " "set(platform::kInvalidGPUId). 
hashed by name.") diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index cc01db80ca..f8b3b8a8ba 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -49,7 +49,6 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto outs = ctx.MultiOutput("Out"); std::string reduction = ctx.Attr("reduction"); - ncclRedOp_t reduction_op_ = ncclSum; if (reduction == "ncclMin") { @@ -101,8 +100,23 @@ class NCCLReduceKernel : public framework::OpKernel { auto ins = ctx.MultiInput("X"); // x0, x1, x2 auto outs = ctx.MultiOutput("Out"); - int root = ctx.Attr("root"); + std::string reduction = ctx.Attr("reduction"); + ncclRedOp_t reduction_op_ = ncclSum; + + if (reduction == "ncclMin") { + reduction_op_ = ncclMin; + } else if (reduction == "ncclMax") { + reduction_op_ = ncclMax; + } else if (reduction == "ncclSum") { + reduction_op_ = ncclSum; + } else if (reduction == "ncclProd") { + reduction_op_ = ncclProd; + } else { + PADDLE_ENFORCE(false, "Invalid reduction. default ncclSum."); + } + + int root = ctx.Attr("root"); auto* comm = ctx.Input("Communicator"); auto stream = reinterpret_cast( @@ -128,7 +142,8 @@ class NCCLReduceKernel : public framework::OpKernel { PADDLE_ENFORCE(platform::dynload::ncclReduce( ins[i]->data(), recvbuffer, ins[i]->numel(), - NCCLTypeWrapper::type, ncclSum, root, comm->comms_[idx], stream)); + NCCLTypeWrapper::type, reduction_op_, root, comm->comms_[idx], + stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream)); VLOG(1) << "gpu : " << gpu_id << " finished reduce. send " From f632706c18ee926700ad3fbf73d4952ed648c395 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Thu, 26 Oct 2017 15:09:14 -0700 Subject: [PATCH 46/81] fix based on comment --- paddle/pybind/pybind.cc | 2 ++ python/paddle/v2/framework/tests/test_nccl_init_op.py | 7 +++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 9288468a03..35fbf4d04a 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -34,6 +34,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/operators/nccl/nccl_gpu_common.h" +#include "paddle/platform/gpu_info.h" #endif namespace paddle { @@ -482,6 +483,7 @@ All parameter, weight, gradient are variables in Paddle. 
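  // [editorial sketch] The m.def() line added just below follows the usual
  // pybind11 pattern for exposing a free function. Stand-alone it would look
  // like this (module name "demo" and the extern declaration are illustrative
  // assumptions, not part of the patch):
  //
  //   #include <pybind11/pybind11.h>
  //   namespace py = pybind11;
  //   int GetCUDADeviceCount();  // e.g. a thin wrapper over cudaGetDeviceCount
  //   PYBIND11_PLUGIN(demo) {
  //     py::module m("demo");
  //     m.def("get_cuda_device_count", &GetCUDADeviceCount);
  //     return m.ptr();  // the same return-m.ptr() idiom this file uses
  //   }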
BindOpDesc(m); m.def("op_support_gpu", OpSupportGPU); + m.def("get_cuda_device_count", platform::GetCUDADeviceCount); return m.ptr(); } diff --git a/python/paddle/v2/framework/tests/test_nccl_init_op.py b/python/paddle/v2/framework/tests/test_nccl_init_op.py index 8aed14c15d..03d46d1c60 100644 --- a/python/paddle/v2/framework/tests/test_nccl_init_op.py +++ b/python/paddle/v2/framework/tests/test_nccl_init_op.py @@ -5,11 +5,10 @@ from paddle.v2.framework.op import Operator import paddle.v2.framework.core as core from op_test import OpTest, create_op, set_input -gpu_list = "0,1,2,3" - -if not core.is_compile_gpu() or not gpu_list: +if not core.is_compile_gpu(): exit(0) +gpu_count = core.get_cuda_device_count g_scope = core.Scope() g_ctx = core.DeviceContext.create(core.CPUPlace()) @@ -17,7 +16,7 @@ g_ctx = core.DeviceContext.create(core.CPUPlace()) class TestNCCLInit(unittest.TestCase): def test_init(self): self.op_type = "ncclInit" - self.gpus = [int(g) for g in gpu_list.split(",")] + self.gpus = [int(g) for g in range(gpu_count)] self.inputs = {} self.attrs = {"gpus": self.gpus} From 75eacccd5c011421422f538e59d9a0aa4ed47b05 Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Thu, 26 Oct 2017 15:14:06 -0700 Subject: [PATCH 47/81] "rerun ci" --- python/paddle/v2/framework/tests/test_nccl_init_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_nccl_init_op.py b/python/paddle/v2/framework/tests/test_nccl_init_op.py index 03d46d1c60..9fd4b3e07c 100644 --- a/python/paddle/v2/framework/tests/test_nccl_init_op.py +++ b/python/paddle/v2/framework/tests/test_nccl_init_op.py @@ -8,7 +8,7 @@ from op_test import OpTest, create_op, set_input if not core.is_compile_gpu(): exit(0) -gpu_count = core.get_cuda_device_count +gpu_count = core.get_cuda_device_count() g_scope = core.Scope() g_ctx = core.DeviceContext.create(core.CPUPlace()) @@ -16,7 +16,7 @@ g_ctx = core.DeviceContext.create(core.CPUPlace()) class TestNCCLInit(unittest.TestCase): def test_init(self): self.op_type = "ncclInit" - self.gpus = [int(g) for g in range(gpu_count)] + self.gpus = range(gpu_count) self.inputs = {} self.attrs = {"gpus": self.gpus} From 37842d802d7b283c5f6de52d0f9b007e0ae83a8d Mon Sep 17 00:00:00 2001 From: Dong Zhihong Date: Thu, 26 Oct 2017 15:33:54 -0700 Subject: [PATCH 48/81] rerun ci --- paddle/pybind/pybind.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 35fbf4d04a..bc87fabf3f 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -483,7 +483,9 @@ All parameter, weight, gradient are variables in Paddle. BindOpDesc(m); m.def("op_support_gpu", OpSupportGPU); +#ifdef PADDLE_WITH_CUDA m.def("get_cuda_device_count", platform::GetCUDADeviceCount); +#endif return m.ptr(); } From b9edcc4a1b4f2c12e878169b21abcb4b4aab3fae Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 27 Oct 2017 11:12:15 +0800 Subject: [PATCH 49/81] sss --- paddle/operators/math/context_project.h | 161 +++++++++++++++++++----- paddle/operators/sequence_conv_op.h | 32 +++-- 2 files changed, 141 insertions(+), 52 deletions(-) diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index b7466d206e..7d9cdab2cf 100644 --- a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -31,6 +31,7 @@ using EigenMatrix = framework::EigenMatrix; * a sequence. The i-th row of the output is the concatenation of * context_length rows of the input. 
The context_length rows are the * consecutive rows from the i+shift_start row. + * ContextProjectGradFunctor is the inverse process of ContextProjectFunctor. * \param in Input data. * \param Shape The shape of Input data, @@ -85,16 +86,126 @@ template class ContextProjectFunctor { public: void operator()(const platform::DeviceContext& context, - framework::LoDTensor& in, framework::Tensor& padding_data, - framework::Tensor& col, bool padding_trainable, - int context_start, int context_length, int context_stride, - int up_pad, int down_pad, bool gradient, bool input_grad, - bool pad_grad) { + const framework::LoDTensor& in, + const framework::Tensor& padding_data, framework::Tensor& col, + bool padding_trainable, int context_start, int context_length, + int context_stride, int up_pad, int down_pad) { auto lod_level_0 = in.lod()[0]; paddle::operators::math::Im2ColFunctor< paddle::operators::math::ColFormat::kOCF, Place, float> im2col_ocf; + + int input_row_begin, input_row_end; + int sequence_height, sequence_width; + sequence_width = in.dims()[1]; + + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + input_row_begin = (context_start > 0) + ? static_cast(lod_level_0[i]) + context_start + : static_cast(lod_level_0[i]); + input_row_end = static_cast(lod_level_0[i + 1]); + + framework::Tensor out_t = col.Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + if (input_row_begin < input_row_end) { + framework::Tensor in_t = in.Slice(input_row_begin, input_row_end); + + std::vector output_shape( + {sequence_height, 1, 1, context_length, + sequence_width}); // output_height, output_width, + // input_channels, filter_height, filter_width + + out_t.Resize(framework::make_ddim(output_shape)); + + std::vector input_shape( + {1, input_row_end - input_row_begin, + sequence_width}); // input_channels, input_height, input_width + in_t.Resize(framework::make_ddim(input_shape)); + + im2col_ocf(context, in_t, out_t, + /*stride_height*/ context_stride, /*stride_width*/ 1, up_pad, + down_pad, 0, 0); + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + if (padding_trainable) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + framework::Tensor out_t = + col.Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + // add up trainable data + out_t.Resize({sequence_height * context_length, sequence_width}); + + if (up_pad > 0) { // add up pad + int padding_rows = std::min( + up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); + + for (int k = 0; k < padding_rows; ++k) { + int padding_size = + k + context_length < up_pad ? context_length : up_pad - k; + framework::Tensor out_t_sub = out_t.Slice( + k * context_length, k * context_length + padding_size); + framework::Tensor w_sub = padding_data.Slice(k, k + padding_size); + // in this block, using EigenVector::Flatten is ok too. + auto out_t_sub_e = EigenMatrix::From(out_t_sub); + auto w_sub_e = EigenMatrix::From(w_sub); + out_t_sub_e.device(*context.GetEigenDevice()) = w_sub_e; + } + } + if (down_pad > 0) { // add down pad + int down_pad_begin_row = + std::max(0, + (sequence_height - context_start - context_length) + 1) + + 1; + int padding_begin = std::max(0, context_start - sequence_height); + int padding_size = + sequence_height - context_start >= context_length + ? 
1 + : context_length - (sequence_height - context_start); + if (context_start >= sequence_height) padding_size = context_length; + int padding_idx = padding_begin; + for (int t = 0; t + down_pad_begin_row <= sequence_height; + ++t, ++padding_size) { + if (context_start >= sequence_height) padding_size = context_length; + if (padding_size > context_length) { + padding_size = context_length; + padding_idx++; + } + if (padding_begin > 0 || sequence_height == context_start) + padding_idx = padding_begin + t; + framework::Tensor out_t_sub = out_t.Slice( + (down_pad_begin_row + t) * context_length - padding_size, + (down_pad_begin_row + t) * context_length); + framework::Tensor w_sub = padding_data.Slice( + up_pad + padding_idx, up_pad + padding_idx + padding_size); + auto out_t_sub_e = EigenMatrix::From(out_t_sub); + auto w_sub_e = EigenMatrix::From(w_sub); + out_t_sub_e.device(*context.GetEigenDevice()) = w_sub_e; + } + } + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + } +}; + +template +class ContextProjectGradFunctor { + public: + void operator()(const platform::DeviceContext& context, + framework::LoDTensor& in, framework::Tensor& padding_data, + framework::Tensor& col, bool padding_trainable, + int context_start, int context_length, int context_stride, + int up_pad, int down_pad, bool input_grad, bool pad_grad) { + auto lod_level_0 = in.lod()[0]; + paddle::operators::math::Col2ImFunctor< paddle::operators::math::ColFormat::kOCF, Place, float> col2im_ocf; @@ -102,10 +213,8 @@ class ContextProjectFunctor { int input_row_begin, input_row_end; int sequence_height, sequence_width; sequence_width = in.dims()[1]; - input_grad = gradient && input_grad; - pad_grad = gradient && pad_grad; - if (!gradient || input_grad) { + if (input_grad) { for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { input_row_begin = (context_start > 0) ? static_cast(lod_level_0[i]) + context_start @@ -133,20 +242,14 @@ class ContextProjectFunctor { sequence_width}); // input_channels, input_height, input_width in_t.Resize(framework::make_ddim(input_shape)); - if (gradient) { - col2im_ocf(context, in_t, out_t, - /*stride_height*/ context_stride, /*stride_width*/ 1, - up_pad, down_pad, 0, 0); - } else { - im2col_ocf(context, in_t, out_t, - /*stride_height*/ context_stride, /*stride_width*/ 1, - up_pad, down_pad, 0, 0); - } + col2im_ocf(context, in_t, out_t, + /*stride_height*/ context_stride, /*stride_width*/ 1, + up_pad, down_pad, 0, 0); out_t.Resize({sequence_height, context_length * sequence_width}); } } } - if (!gradient || pad_grad) { + if (pad_grad) { if (padding_trainable) { for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { framework::Tensor out_t = @@ -154,11 +257,9 @@ class ContextProjectFunctor { static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); - - // add up trainable data out_t.Resize({sequence_height * context_length, sequence_width}); - if (up_pad > 0) { // add up pad + if (up_pad > 0) { int padding_rows = std::min( up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); @@ -171,15 +272,11 @@ class ContextProjectFunctor { // in this block, using EigenVector::Flatten is ok too. 
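          // [annotation] Gradient step for the trainable up-padding: where
          // the forward functor copies the weights out (out_t_sub_e =
          // w_sub_e), the statements below accumulate the column-buffer
          // gradient back into the padding weights, effectively
          //   dW_pad[k] += sum(dCol slices that were filled from row k).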
auto out_t_sub_e = EigenMatrix::From(out_t_sub); auto w_sub_e = EigenMatrix::From(w_sub); - if (gradient) { - w_sub_e.device(*context.GetEigenDevice()) = - w_sub_e + out_t_sub_e; - } else { - out_t_sub_e.device(*context.GetEigenDevice()) = w_sub_e; - } + w_sub_e.device(*context.GetEigenDevice()) = + w_sub_e + out_t_sub_e; } } - if (down_pad > 0) { // add down pad + if (down_pad > 0) { int down_pad_begin_row = std::max( 0, (sequence_height - context_start - context_length) + 1) + @@ -208,12 +305,8 @@ class ContextProjectFunctor { up_pad + padding_idx, up_pad + padding_idx + padding_size); auto out_t_sub_e = EigenMatrix::From(out_t_sub); auto w_sub_e = EigenMatrix::From(w_sub); - if (gradient) { - w_sub_e.device(*context.GetEigenDevice()) = - w_sub_e + out_t_sub_e; - } else { - out_t_sub_e.device(*context.GetEigenDevice()) = w_sub_e; - } + w_sub_e.device(*context.GetEigenDevice()) = + w_sub_e + out_t_sub_e; } } out_t.Resize({sequence_height, context_length * sequence_width}); diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h index c502601b38..5727238c0d 100644 --- a/paddle/operators/sequence_conv_op.h +++ b/paddle/operators/sequence_conv_op.h @@ -65,12 +65,10 @@ class SequenceConvKernel : public framework::OpKernel { paddle::operators::math::ContextProjectFunctor seq_project_functor; - LoDTensor* input = const_cast(in); - Tensor* pad_data = const_cast(padding_data); - seq_project_functor(context.device_context(), *input, *pad_data, col, + seq_project_functor(context.device_context(), *in, *padding_data, col, padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, false, false, false); + context_stride, up_pad, down_pad); math::matmul(context.device_context(), col, false, filter, false, static_cast(1.0), out, static_cast(0.0)); @@ -117,15 +115,18 @@ class SequenceConvGradKernel : public framework::OpKernel { } paddle::operators::math::ContextProjectFunctor seq_project_functor; + paddle::operators::math::ContextProjectGradFunctor + seq_project_grad_functor; if (in_g) { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); set_zero(context.device_context(), in_g, static_cast(0)); - seq_project_functor(context.device_context(), *in_g, *padding_data_g, col, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, true, true, false); + seq_project_grad_functor(context.device_context(), *in_g, *padding_data_g, + col, padding_trainable, context_start, + context_length, context_stride, up_pad, down_pad, + true, false); } if (padding_trainable && padding_data_g) { @@ -133,9 +134,10 @@ class SequenceConvGradKernel : public framework::OpKernel { set_zero(context.device_context(), padding_data_g, static_cast(0)); LoDTensor* input = const_cast(in); - seq_project_functor(context.device_context(), *input, *padding_data_g, - col, padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, true, false, true); + seq_project_grad_functor(context.device_context(), *input, + *padding_data_g, col, padding_trainable, + context_start, context_length, context_stride, + up_pad, down_pad, false, true); } if (filter_g) { @@ -150,15 +152,9 @@ class SequenceConvGradKernel : public framework::OpKernel { padding_data = context.Input("PaddingData"); } - sequence_width = static_cast(in->dims()[1]); - - LoDTensor* input = const_cast(in); - Tensor* pad_data = const_cast(padding_data); - - seq_project_functor(context.device_context(), *input, *pad_data, col, + 
seq_project_functor(context.device_context(), *in, *padding_data, col, padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, false, false, - false); + context_stride, up_pad, down_pad); math::matmul(context.device_context(), col, true, out_grad, false, T(1.0), &filter_grad, T(1.0)); From 99308b1876b79aa4157767d34716095f54acb20d Mon Sep 17 00:00:00 2001 From: dong zhihong Date: Sat, 28 Oct 2017 06:40:37 +0800 Subject: [PATCH 50/81] rerun CI --- python/paddle/v2/framework/tests/test_nccl_init_op.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/paddle/v2/framework/tests/test_nccl_init_op.py b/python/paddle/v2/framework/tests/test_nccl_init_op.py index 9fd4b3e07c..b56a857a98 100644 --- a/python/paddle/v2/framework/tests/test_nccl_init_op.py +++ b/python/paddle/v2/framework/tests/test_nccl_init_op.py @@ -9,6 +9,10 @@ if not core.is_compile_gpu(): exit(0) gpu_count = core.get_cuda_device_count() + +if gpu_count <= 1: + exit(1) + g_scope = core.Scope() g_ctx = core.DeviceContext.create(core.CPUPlace()) From 6f009cf8ba7a2ae7221ebfa9129c2a05abf49b0d Mon Sep 17 00:00:00 2001 From: dong zhihong Date: Sat, 28 Oct 2017 06:43:21 +0800 Subject: [PATCH 51/81] rerun ci --- python/paddle/v2/framework/tests/test_nccl_init_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/framework/tests/test_nccl_init_op.py b/python/paddle/v2/framework/tests/test_nccl_init_op.py index b56a857a98..054909fdf5 100644 --- a/python/paddle/v2/framework/tests/test_nccl_init_op.py +++ b/python/paddle/v2/framework/tests/test_nccl_init_op.py @@ -11,7 +11,7 @@ if not core.is_compile_gpu(): gpu_count = core.get_cuda_device_count() if gpu_count <= 1: - exit(1) + exit(0) g_scope = core.Scope() g_ctx = core.DeviceContext.create(core.CPUPlace()) From 6bdf5c141739a845b8993d4d9dbc3000b4f9978e Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Sat, 28 Oct 2017 09:35:10 +0800 Subject: [PATCH 52/81] fix bug --- paddle/operators/pool_cudnn_op.cu | 5 +- paddle/operators/pool_op.cc | 45 +++++++------ paddle/operators/pool_op.h | 7 +- paddle/operators/pool_with_index_op.cc | 65 +++++++++++-------- paddle/operators/pool_with_index_op.h | 4 ++ .../v2/framework/tests/test_pool2d_op.py | 5 +- .../v2/framework/tests/test_pool3d_op.py | 19 +++--- .../v2/framework/tests/test_pool_max_op.py | 34 +++++----- 8 files changed, 109 insertions(+), 75 deletions(-) diff --git a/paddle/operators/pool_cudnn_op.cu b/paddle/operators/pool_cudnn_op.cu index bc29be18e7..8d0741dccc 100644 --- a/paddle/operators/pool_cudnn_op.cu +++ b/paddle/operators/pool_cudnn_op.cu @@ -43,6 +43,7 @@ class PoolCudnnOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); if (ctx.Attr("globalPooling")) { for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; ksize[i] = static_cast(input->dims()[i + 2]); } } @@ -97,8 +98,10 @@ class PoolCudnnGradOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); if (ctx.Attr("globalPooling")) { - for (size_t i = 0; i < ksize.size(); ++i) + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; ksize[i] = static_cast(input->dims()[i + 2]); + } } const T *input_data = input->data(); diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index c4ab29e4d5..4d75c11bc8 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -39,8 +39,10 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { if (ctx->Attrs().Get("globalPooling")) { 
ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; ksize[i] = static_cast(in_x_dims[i + 2]); + } } PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, @@ -84,15 +86,16 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, "(string), pooling type, can be \"max\" for max-pooling " "and \"avg\" for average-pooling.") .InEnum({"max", "avg"}); - AddAttr>( - "ksize", - "(vector ), the pooling window size(height, width) of pooling operator." - "If globalPooling = true, ksize is ignored and need not be " - "specified."); // TODO(Chengduo): Add checker. (Currently, + AddAttr>("ksize", + "(vector ), the pooling window size(height, width) " + "of pooling operator." + "If globalPooling = true, ksize and paddings will " + "be ignored."); // TODO(Chengduo): Add checker. + // (Currently, // TypedAttrChecker don't support vector type.) AddAttr("globalPooling", "(bool default: false), whether to use the global pooling." - "If globalPooling = true, ksize is ignored.") + "If globalPooling = true, ksize and paddings will be ignored.") .SetDefault(false); AddAttr>( "strides", @@ -101,7 +104,8 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector defalut:{0,0}), paddings(height, width) of pooling operator.") + "(vector default:{0,0}), paddings(height, width) of pooling operator." + "If globalPooling = true, paddings and ksize will be ignored.") .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) @@ -145,25 +149,28 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, "(string), pooling type, can be \"max\" for max-pooling " "and \"avg\" for average-pooling.") .InEnum({"max", "avg"}); - AddAttr>( - "ksize", - "(vector ), the pooling window size(depth, height, width) of pooling " - "operator." - "If globalPooling = true, ksize is ignored and need not be " - "specified."); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) + AddAttr>("ksize", + "(vector ), the pooling window size(depth, height, " + "width) of pooling " + "operator." + "If globalPooling = true, ksize and paddings will " + "be ignored."); // TODO(Chengduo): Add checker. + // (Currently, + // TypedAttrChecker don't support vector type.) AddAttr("globalPooling", "(bool default: false), whether to use the global pooling." - "If globalPooling = true, ksize is ignored.") + "If globalPooling = true, ksize and paddings will be ignored.") .SetDefault(false); AddAttr>("strides", "(vector, default:{1,1,1}), strides(depth, height, " "width) of pooling operator.") .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) - AddAttr>("paddings", - "(vector defalut:{0,0,0}), paddings(depth, height, " - "width) of pooling operator.") + AddAttr>( + "paddings", + "(vector default:{0,0,0}), paddings(depth, height, " + "width) of pooling operator." + "If globalPooling = true, ksize and paddings will be ignored.") .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.)
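The recurring hunk above (zeroing paddings wherever globalPooling overwrites ksize) is easiest to see in isolation. Below is a minimal, self-contained sketch of why the stale paddings mattered; every name in it is hypothetical, and the only thing assumed is the standard pooled-size formula out = (in + 2 * pad - ksize) / stride + 1 that these operators use.

#include <cstdio>
#include <vector>

// Pooled output size for one spatial dimension:
// out = (in + 2 * pad - ksize) / stride + 1.
int PooledSize(int in, int ksize, int pad, int stride) {
  return (in + 2 * pad - ksize) / stride + 1;
}

int main() {
  std::vector<int> in_dims = {17, 23};  // spatial dims (H, W) of the input
  std::vector<int> ksize = {3, 3};
  std::vector<int> paddings = {1, 1};
  bool global_pooling = true;

  if (global_pooling) {
    for (size_t i = 0; i < ksize.size(); ++i) {
      paddings[i] = 0;        // the line this patch adds in every kernel
      ksize[i] = in_dims[i];  // without it, the stale pad leaks into the
    }                         // size formula below
  }
  for (size_t i = 0; i < ksize.size(); ++i) {
    std::printf("dim %zu -> %d\n", i,
                PooledSize(in_dims[i], ksize[i], paddings[i], /*stride=*/1));
  }
  return 0;
}

Compiled and run as-is, this prints 1 for each dimension, the 1 x 1 map global pooling promises; dropping the paddings[i] = 0 line reproduces the pre-fix size of 3.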
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h index ba8edc9cf6..d9d445f6a6 100644 --- a/paddle/operators/pool_op.h +++ b/paddle/operators/pool_op.h @@ -63,6 +63,7 @@ class PoolKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); if (context.Attr("globalPooling")) { for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; ksize[i] = static_cast(in_x->dims()[i + 2]); } } @@ -103,6 +104,7 @@ class PoolKernel : public framework::OpKernel { paddings, pool_process); } } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } } }; @@ -123,8 +125,10 @@ class PoolGradKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); if (context.Attr("globalPooling")) { - for (size_t i = 0; i < ksize.size(); ++i) + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; ksize[i] = static_cast(in_x->dims()[i + 2]); + } } if (in_x_grad) { @@ -164,6 +168,7 @@ class PoolGradKernel : public framework::OpKernel { *out_grad, ksize, strides, paddings, pool_process); } } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } } } diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index ea21845751..95e896e7cc 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -46,8 +46,10 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { if (ctx->Attrs().Get("globalPooling")) { ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; ksize[i] = static_cast(in_x_dims[i + 2]); + } } PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, @@ -87,31 +89,33 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", - "(Tensor) The input tensor of pooling operator. " + "(Tensor), the input tensor of pooling operator. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of image."); AddOutput("Out", - "(Tensor) The output tensor of pooling operator." + "(Tensor), the output tensor of pooling operator." "The format of output tensor is also NCHW." "Where N is batch size, C is " "the number of channels, H and W is the height and " "width of image."); AddOutput("Mask", - "(Tensor) The Mask tensor of pooling operator." + "(Tensor), the Mask tensor of pooling operator." "The format of output tensor is also NCHW." "Where N is batch size, C is the number of channels, H and W " "is the height and width of image." "The value in it is the index in current feature map"); - AddAttr>( - "ksize", - "(vector ), the pooling window size(height, width) of pooling operator." - "If globalPooling = true, ksize is ignored and need not be " - "specified."); // TODO(Chengduo): Add checker. (Currently, + AddAttr>("ksize", + "(vector ), the pooling window size(height, " + "width) of pooling operator." + "If globalPooling = true, ksize and paddings " + "will be ignored."); // TODO(Chengduo): Add + // checker. (Currently, // TypedAttrChecker don't support vector type.) - AddAttr("globalPooling", - "(bool default: false), whether to use the global pooling." - "If globalPooling = true, ksize is ignored.") + AddAttr( + "globalPooling", + "(bool default: false), whether to use the global pooling." 
+ "If globalPooling = true, ksize and paddings will be ignored.") .SetDefault(false); AddAttr>( "strides", @@ -120,7 +124,8 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector defalut:{0,0}), paddings(height, width) of pooling operator.") + "(vector defalut:{0, 0}), paddings(height, width) of pooling operator." + "If globalPooling = true, paddings and will be ignored.") .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) @@ -153,42 +158,46 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", - "(Tensor) The input tensor of pooling operator. " + "(Tensor), the input tensor of pooling operator. " "The format of input tensor is NCDHW. Where N is batch size, C is " "the number of channels, D, H and W is the depth, height and width of " "image."); AddOutput("Out", - "(Tensor) The output tensor of pooling operator." + "(Tensor), the output tensor of pooling operator." "The format of output tensor is also NCDHW." "Where N is batch size, C is " "the number of channels, D, H and W is the depth, height and " "width of image."); AddOutput("Mask", - "(Tensor) The Mask tensor of pooling operator." + "(Tensor), the Mask tensor of pooling operator." "The format of output tensor is also NCDHW." "Where N is batch size, C is the number of channels, D, H and W " "is the depth, height and width of image." "The value in it is the index in current feature map"); - AddAttr>( - "ksize", - "(vector ), the pooling window size(depth, height, width) of pooling " - "operator." - "If globalPooling = true, ksize is ignored and need not be " - "specified."); // TODO(Chengduo): Add checker. (Currently, + AddAttr>("ksize", + "(vector), the pooling window size(depth, " + "height, width) of pooling " + "operator." + "If globalPooling = true, ksize and paddings " + "will be ignored."); // TODO(Chengduo): Add + // checker. (Currently, // TypedAttrChecker don't support vector type.) - AddAttr("globalPooling", - "(bool default: false), whether to use the global pooling." - "If globalPooling = true, ksize is ignored.") + AddAttr( + "globalPooling", + "(bool default: false), whether to use the global pooling." + "If globalPooling = true, ksize and paddings will be ignored.") .SetDefault(false); AddAttr>("strides", "(vector, default:{1,1,1}), strides(depth, " "height, width) of pooling operator.") .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) - AddAttr>("paddings", - "(vector defalut:{0,0,0}), paddings(depth, " - "height, width) of pooling operator.") + AddAttr>( + "paddings", + "(vector defalut:{0,0,0}), paddings(depth, " + "height, width) of pooling operator." + "If globalPooling = true, paddings and ksize will be ignored.") .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) 
diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h index 01b961ca82..4862774043 100644 --- a/paddle/operators/pool_with_index_op.h +++ b/paddle/operators/pool_with_index_op.h @@ -37,6 +37,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); if (context.Attr("globalPooling")) { for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; ksize[i] = static_cast(in_x->dims()[i + 2]); } } @@ -54,6 +55,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { pool3d_forward(context.device_context(), *in_x, *out, *mask, ksize, strides, paddings); } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } } }; @@ -72,6 +74,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); if (context.Attr("globalPooling")) { for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; ksize[i] = static_cast(in_x_grad->dims()[i + 2]); } } @@ -95,6 +98,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { pool3d_backward(context.device_context(), *in_x_grad, *out_grad, *mask, ksize, strides, paddings); } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } } } diff --git a/python/paddle/v2/framework/tests/test_pool2d_op.py b/python/paddle/v2/framework/tests/test_pool2d_op.py index f04de8133a..c93469e119 100644 --- a/python/paddle/v2/framework/tests/test_pool2d_op.py +++ b/python/paddle/v2/framework/tests/test_pool2d_op.py @@ -49,9 +49,12 @@ class TestPool2d_Op(OpTest): self.init_test_case() self.init_op_type() self.init_pool_type() + if self.global_pool: + self.paddings = [0 for _ in range(len(self.paddings))] input = np.random.random(self.shape).astype("float32") output = self.pool2D_forward_naive(input, self.ksize, self.strides, - self.paddings, self.global_pool) + self.paddings, + self.global_pool).astype("float32") self.inputs = {'X': input} self.attrs = { diff --git a/python/paddle/v2/framework/tests/test_pool3d_op.py b/python/paddle/v2/framework/tests/test_pool3d_op.py index d62fbee974..416f0df7cd 100644 --- a/python/paddle/v2/framework/tests/test_pool3d_op.py +++ b/python/paddle/v2/framework/tests/test_pool3d_op.py @@ -54,10 +54,13 @@ def avg_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0): class TestPool3d_Op(OpTest): def setUp(self): - self.initTestCase() + self.init_test_case() + if self.global_pool: + self.paddings = [0 for _ in range(len(self.paddings))] input = np.random.random(self.shape).astype("float32") output = self.pool3D_forward_naive(input, self.ksize, self.strides, - self.paddings, self.global_pool) + self.paddings, + self.global_pool).astype("float32") self.inputs = {'X': input} self.attrs = { @@ -77,7 +80,7 @@ class TestPool3d_Op(OpTest): if self.pool_type != "max": self.check_grad(set(['X']), 'Out', max_relative_error=0.07) - def initTestCase(self): + def init_test_case(self): self.global_pool = True self.op_type = "pool3d" self.pool_type = "avg" @@ -89,7 +92,7 @@ class TestPool3d_Op(OpTest): class TestCase1(TestPool3d_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = False self.op_type = "pool3d" self.pool_type = "avg" @@ -101,7 +104,7 @@ class TestCase1(TestPool3d_Op): class TestCase2(TestPool3d_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = False self.op_type = "pool3d" self.pool_type = "avg" @@ -113,7 +116,7 @@ class TestCase2(TestPool3d_Op): class 
TestCase3(TestPool3d_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = True self.op_type = "pool3d" self.pool_type = "max" @@ -125,7 +128,7 @@ class TestCase3(TestPool3d_Op): class TestCase4(TestPool3d_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = False self.op_type = "pool3d" self.pool_type = "max" @@ -137,7 +140,7 @@ class TestCase4(TestPool3d_Op): class TestCase5(TestPool3d_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = False self.op_type = "pool3d" self.pool_type = "max" diff --git a/python/paddle/v2/framework/tests/test_pool_max_op.py b/python/paddle/v2/framework/tests/test_pool_max_op.py index f0f8aa6089..cc1a867761 100644 --- a/python/paddle/v2/framework/tests/test_pool_max_op.py +++ b/python/paddle/v2/framework/tests/test_pool_max_op.py @@ -3,11 +3,7 @@ import numpy as np from op_test import OpTest -def max_pool3D_forward_naive(x, - ksize, - strides, - paddings=[0, 0, 0], - global_pool=0): +def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0): N, C, D, H, W = x.shape if global_pool == 1: @@ -44,7 +40,7 @@ def max_pool3D_forward_naive(x, return out, mask -def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0): +def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0): N, C, H, W = x.shape if global_pool == 1: @@ -77,10 +73,14 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0): class TestMaxPoolWithIndex_Op(OpTest): def setUp(self): - self.initTestCase() + self.init_test_case() + if self.global_pool: + self.paddings = [0 for _ in range(len(self.paddings))] input = np.random.random(self.shape).astype("float32") output, mask = self.pool_forward_naive(input, self.ksize, self.strides, self.paddings, self.global_pool) + output = output.astype("float32") + mask = mask.astype("float32") self.attrs = { 'strides': self.strides, @@ -98,7 +98,7 @@ class TestMaxPoolWithIndex_Op(OpTest): # def test_check_grad(self): # self.check_grad(set(['X']), ['Out'], max_relative_error=0.07) - def initTestCase(self): + def init_test_case(self): self.global_pool = True self.index = "max_pool3d_with_index" self.op_type = "%s" % self.index @@ -110,7 +110,7 @@ class TestMaxPoolWithIndex_Op(OpTest): class TestCase1(TestMaxPoolWithIndex_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = True self.op_type = "max_pool3d_with_index" self.pool_forward_naive = max_pool3D_forward_naive @@ -121,7 +121,7 @@ class TestCase1(TestMaxPoolWithIndex_Op): class TestCase2(TestMaxPoolWithIndex_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = False self.op_type = "max_pool3d_with_index" self.pool_forward_naive = max_pool3D_forward_naive @@ -132,7 +132,7 @@ class TestCase2(TestMaxPoolWithIndex_Op): class TestCase3(TestMaxPoolWithIndex_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = False self.op_type = "max_pool3d_with_index" self.pool_forward_naive = max_pool3D_forward_naive @@ -143,7 +143,7 @@ class TestCase3(TestMaxPoolWithIndex_Op): class TestCase4(TestMaxPoolWithIndex_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = True self.op_type = "max_pool3d_with_index" self.pool_forward_naive = max_pool3D_forward_naive @@ -154,7 +154,7 @@ class TestCase4(TestMaxPoolWithIndex_Op): class TestCase5(TestMaxPoolWithIndex_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = True self.op_type = "max_pool3d_with_index" 
self.pool_forward_naive = max_pool3D_forward_naive @@ -165,7 +165,7 @@ class TestCase5(TestMaxPoolWithIndex_Op): class TestCase6(TestMaxPoolWithIndex_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = False self.op_type = "max_pool2d_with_index" self.pool_forward_naive = max_pool2D_forward_naive @@ -176,7 +176,7 @@ class TestCase6(TestMaxPoolWithIndex_Op): class TestCase7(TestMaxPoolWithIndex_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = False self.op_type = "max_pool2d_with_index" self.pool_forward_naive = max_pool2D_forward_naive @@ -187,7 +187,7 @@ class TestCase7(TestMaxPoolWithIndex_Op): class TestCase8(TestMaxPoolWithIndex_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = True self.op_type = "max_pool2d_with_index" self.pool_forward_naive = max_pool2D_forward_naive @@ -198,7 +198,7 @@ class TestCase8(TestMaxPoolWithIndex_Op): class TestCase9(TestMaxPoolWithIndex_Op): - def initTestCase(self): + def init_test_case(self): self.global_pool = True self.op_type = "max_pool2d_with_index" self.pool_forward_naive = max_pool2D_forward_naive From 8f6c0a0fadb3a67d3241a61cffcb388dcfd47092 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 28 Oct 2017 13:59:18 -0700 Subject: [PATCH 53/81] Extract InferShape to many cc files (#5174) * Shrink Operator.h * Fix CI compile --- paddle/framework/CMakeLists.txt | 5 +- paddle/framework/op_desc.cc | 132 ++++++++++++++- paddle/framework/op_registry.h | 1 + paddle/framework/operator.cc | 132 +++++++++++++++ paddle/framework/operator.h | 248 +--------------------------- paddle/framework/shape_inference.cc | 54 ++++++ paddle/framework/shape_inference.h | 50 ++---- 7 files changed, 334 insertions(+), 288 deletions(-) create mode 100644 paddle/framework/shape_inference.cc diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index f69a3cfbf8..f4fef055da 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -24,9 +24,10 @@ cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) -cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog) +cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute) +cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) -cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info operator glog) +cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 133869e7b5..c2d6f124ad 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -16,15 +16,51 @@ limitations under the License. 
*/ #include #include #include +#include "glog/logging.h" #include "paddle/framework/block_desc.h" #include "paddle/framework/operator.h" #include "paddle/framework/program_desc.h" - -#include "glog/logging.h" +#include "paddle/framework/shape_inference.h" namespace paddle { namespace framework { +class OpDescBind; +class BlockDescBind; +class CompileTimeInferShapeContext : public InferShapeContext { + public: + CompileTimeInferShapeContext(const OpDescBind &op, + const BlockDescBind &block); + + bool HasInput(const std::string &name) const override; + + bool HasOutput(const std::string &name) const override; + + bool HasInputs(const std::string &name) const override; + + bool HasOutputs(const std::string &name) const override; + + DDim GetInputDim(const std::string &name) const override; + + void SetOutputDim(const std::string &name, const DDim &dim) override; + + AttrReader Attrs() const override; + + const std::vector &Inputs( + const std::string &name) const override; + + const std::vector &Outputs( + const std::string &name) const override; + + private: + DDim GetDim(const std::string &name) const override; + + void SetDim(const std::string &name, const DDim &dim) override; + + const OpDescBind &op_; + const BlockDescBind &block_; +}; + OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs) { @@ -288,5 +324,97 @@ void OpDescBind::InferVarType(BlockDescBind *block) const { } } +CompileTimeInferShapeContext::CompileTimeInferShapeContext( + const OpDescBind &op, const BlockDescBind &block) + : op_(op), block_(block) {} + +bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { + const std::vector &input_names = op_.Input(name); + auto length = input_names.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input(%s) should have only one value, " + "but it have %d now", + name, length); + return block_.HasVarRecursive(input_names[0]); +} + +bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const { + const std::vector &output_names = op_.Output(name); + auto length = output_names.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output(%s) should have only one value, " + "but it have %d now", + name, length); + return block_.HasVarRecursive(output_names[0]); +} + +bool CompileTimeInferShapeContext::HasInputs(const std::string &name) const { + const std::vector &input_names = op_.Input(name); + if (input_names.empty()) { + return false; + } + for (auto &input : input_names) { + if (!block_.HasVarRecursive(input)) return false; + } + return true; +} + +bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { + const std::vector &output_names = op_.Output(name); + if (output_names.empty()) { + return false; + } + for (auto &output : output_names) { + if (!block_.HasVarRecursive(output)) return false; + } + return true; +} + +DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const { + std::vector ddims = GetInputsDim(name); + auto length = ddims.size(); + PADDLE_ENFORCE_EQ(length, 1UL, + "Input(%s) should have 1 value, " + "but it has %d now", + name, length); + return ddims[0]; +} + +void CompileTimeInferShapeContext::SetOutputDim(const std::string &name, + const DDim &dim) { + SetOutputsDim(name, {dim}); +} + +AttrReader CompileTimeInferShapeContext::Attrs() const { + return AttrReader(op_.GetAttrMap()); +} + +const std::vector 
&CompileTimeInferShapeContext::Inputs( + const std::string &name) const { + return op_.Input(name); +} + +const std::vector &CompileTimeInferShapeContext::Outputs( + const std::string &name) const { + return op_.Output(name); +} + +DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + return framework::make_ddim(var->Shape()); +} + +void CompileTimeInferShapeContext::SetDim(const std::string &name, + const DDim &dim) { + block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim)); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index ed85c386ec..deacf41f99 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/framework/op_desc.h" #include "paddle/framework/operator.h" #include "paddle/framework/scope.h" +#include "paddle/framework/shape_inference.h" namespace paddle { namespace framework { diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index db154e4f76..9e1e955aae 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/framework/operator.h" #include #include +#include "paddle/framework/shape_inference.h" namespace paddle { namespace framework { @@ -273,5 +274,136 @@ bool OpSupportGPU(const std::string& op_type) { return false; } +class RuntimeInferShapeContext : public InferShapeContext { + public: + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) + : op_(op), scope_(scope) {} + + bool HasInput(const std::string& name) const override { + auto& ins = Inputs(name); + size_t length = ins.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs", + name); + auto ipt = ins[0]; + auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); + return var != nullptr; + } + + bool HasOutput(const std::string& name) const override { + auto& outs = Outputs(name); + size_t length = outs.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs", + name); + auto ipt = outs[0]; + auto* var = ipt == kEmptyVarName ? 
nullptr : scope_.FindVar(ipt); + return var != nullptr; + } + + bool HasInputs(const std::string& name) const override { + auto inputs = op_.Inputs(name); + if (inputs.empty()) { + return false; + } + for (auto& input : inputs) { + if (scope_.FindVar(input) == nullptr) { + return false; + } + } + return true; + } + + bool HasOutputs(const std::string& name) const override { + auto outputs = op_.Outputs(name); + if (outputs.empty()) { + return false; + } + for (auto& output : outputs) { + if (scope_.FindVar(output) == nullptr) { + return false; + } + } + return true; + } + + DDim GetInputDim(const std::string& name) const override { + return GetDim(op_.Input(name)); + } + + void SetOutputDim(const std::string& name, const DDim& dim) override { + SetDim(op_.Output(name), dim); + } + + AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } + + const std::vector& Inputs( + const std::string& name) const override { + return op_.Inputs(name); + } + + const std::vector& Outputs( + const std::string& name) const override { + return op_.Outputs(name); + } + + private: + DDim GetDim(const std::string& name) const override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + } + } + + void SetDim(const std::string& name, const DDim& dim) override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + var->GetMutable()->Resize(dim); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); + } else { + PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + } + } + + const OperatorBase& op_; + const Scope& scope_; +}; + +void OperatorWithKernel::Run(const Scope& scope, + const platform::DeviceContext& dev_ctx) const { + VLOG(3) << "Running operator " << this->Type(); + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); + + ExecutionContext ctx(*this, scope, dev_ctx); + + // check if op[type] has kernel registered. + auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW("op[%s] has no kernel", type_); + } + + // check if op[type] have kernel for kernel_key + OpKernelMap& kernels = kernels_iter->second; + auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx); + auto kernel_iter = kernels.find(kernel_key); + + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op[%s] has no kernel with kernel_key[%s]", type_, kernel_key); + } + + kernel_iter->second->Compute(ctx); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index aa79f16df8..3a9c7a7328 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -29,7 +29,6 @@ limitations under the License. 
*/ #include "paddle/framework/op_info.h" #include "paddle/framework/scope.h" #include "paddle/framework/selected_rows.h" -#include "paddle/framework/shape_inference.h" #include "paddle/framework/tensor.h" #include "paddle/platform/device_context.h" #include "paddle/platform/place.h" @@ -317,226 +316,6 @@ template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const; -class CompileTimeInferShapeContext : public InferShapeContext { - public: - CompileTimeInferShapeContext(const OpDescBind& op, const BlockDescBind& block) - : op_(op), block_(block) {} - - bool HasInput(const std::string& name) const override { - const std::vector& input_names = op_.Input(name); - auto length = input_names.size(); - if (length == 0) { - return false; - } - PADDLE_ENFORCE_EQ(length, 1UL, - "Input(%s) should have only one value, " - "but it have %d now", - name, length); - return block_.HasVarRecursive(input_names[0]); - } - - bool HasOutput(const std::string& name) const override { - const std::vector& output_names = op_.Output(name); - auto length = output_names.size(); - if (length == 0) { - return false; - } - PADDLE_ENFORCE_EQ(length, 1UL, - "Output(%s) should have only one value, " - "but it have %d now", - name, length); - return block_.HasVarRecursive(output_names[0]); - } - - bool HasInputs(const std::string& name) const override { - const std::vector& input_names = op_.Input(name); - if (input_names.empty()) { - return false; - } - for (auto& input : input_names) { - if (!block_.HasVarRecursive(input)) return false; - } - return true; - } - - bool HasOutputs(const std::string& name) const override { - const std::vector& output_names = op_.Output(name); - if (output_names.empty()) { - return false; - } - for (auto& output : output_names) { - if (!block_.HasVarRecursive(output)) return false; - } - return true; - } - - DDim GetInputDim(const std::string& name) const override { - std::vector ddims = GetInputsDim(name); - auto length = ddims.size(); - PADDLE_ENFORCE_EQ(length, 1UL, - "Input(%s) should have 1 value, " - "but it has %d now", - name, length); - return ddims[0]; - } - - void SetInputDim(const std::string& name, const DDim& dim) override { - SetInputsDim(name, {dim}); - } - - DDim GetOutputDim(const std::string& name) const override { - std::vector ddims = GetOutputsDim(name); - auto length = ddims.size(); - PADDLE_ENFORCE_EQ(length, 1UL, - "Output(%s) should have 1 value, " - "but it has %d now", - name, length); - return ddims[0]; - } - - void SetOutputDim(const std::string& name, const DDim& dim) override { - SetOutputsDim(name, {dim}); - } - - AttrReader Attrs() const override { return AttrReader(op_.GetAttrMap()); } - - const std::vector& Inputs( - const std::string& name) const override { - return op_.Input(name); - } - - const std::vector& Outputs( - const std::string& name) const override { - return op_.Output(name); - } - - private: - DDim GetDim(const std::string& name) const override { - auto var = block_.FindVarRecursive(name); - PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); - return framework::make_ddim(var->Shape()); - } - - void SetDim(const std::string& name, const DDim& dim) override { - block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim)); - } - - const OpDescBind& op_; - const BlockDescBind& block_; -}; - -class RuntimeInferShapeContext : public InferShapeContext { - public: - RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) - : op_(op), scope_(scope) {} - - bool HasInput(const std::string& name) 
const override { - auto& ins = Inputs(name); - size_t length = ins.size(); - if (length == 0) { - return false; - } - PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs", - name); - auto ipt = ins[0]; - auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - return var != nullptr; - } - - bool HasOutput(const std::string& name) const override { - auto& outs = Outputs(name); - size_t length = outs.size(); - if (length == 0) { - return false; - } - PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs", - name); - auto ipt = outs[0]; - auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - return var != nullptr; - } - - bool HasInputs(const std::string& name) const override { - auto inputs = op_.Inputs(name); - if (inputs.empty()) { - return false; - } - for (auto& input : inputs) { - if (scope_.FindVar(input) == nullptr) { - return false; - } - } - return true; - } - - bool HasOutputs(const std::string& name) const override { - auto outputs = op_.Outputs(name); - if (outputs.empty()) { - return false; - } - for (auto& output : outputs) { - if (scope_.FindVar(output) == nullptr) { - return false; - } - } - return true; - } - - DDim GetInputDim(const std::string& name) const override { - return GetDim(op_.Input(name)); - } - - void SetInputDim(const std::string& name, const DDim& dim) override { - SetDim(op_.Input(name), dim); - } - - DDim GetOutputDim(const std::string& name) const override { - return GetDim(op_.Output(name)); - } - - void SetOutputDim(const std::string& name, const DDim& dim) override { - SetDim(op_.Output(name), dim); - } - - AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } - - const std::vector& Inputs( - const std::string& name) const override { - return op_.Inputs(name); - } - - const std::vector& Outputs( - const std::string& name) const override { - return op_.Outputs(name); - } - - private: - DDim GetDim(const std::string& name) const override { - Variable* var = scope_.FindVar(name); - if (var->IsType()) { - return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); - } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); - } - } - - void SetDim(const std::string& name, const DDim& dim) override { - Variable* var = scope_.FindVar(name); - if (var->IsType()) { - var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); - } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); - } - } - - const OperatorBase& op_; - const Scope& scope_; -}; - class OpKernelBase { public: /** @@ -595,32 +374,7 @@ class OperatorWithKernel : public OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const final { - VLOG(3) << "Running operator " << this->Type(); - RuntimeInferShapeContext infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); - - ExecutionContext ctx(*this, scope, dev_ctx); - - // check if op[type] has kernel registered. 
- auto& all_op_kernels = AllOpKernels(); - auto kernels_iter = all_op_kernels.find(type_); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW("op[%s] has no kernel", type_); - } - - // check if op[type] have kernel for kernel_key - OpKernelMap& kernels = kernels_iter->second; - auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx); - auto kernel_iter = kernels.find(kernel_key); - - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op[%s] has no kernel with kernel_key[%s]", type_, - kernel_key); - } - - kernel_iter->second->Compute(ctx); - } + const platform::DeviceContext& dev_ctx) const final; static std::unordered_map& AllOpKernels() { diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc new file mode 100644 index 0000000000..33a1d0b9b2 --- /dev/null +++ b/paddle/framework/shape_inference.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include "paddle/framework/shape_inference.h" + +namespace paddle { +namespace framework { + +std::vector InferShapeContext::GetInputsDim( + const std::string &name) const { + const std::vector &names = Inputs(name); + return GetDims(names); +} + +void InferShapeContext::SetOutputsDim( + const std::string &name, const std::vector &dims) { + auto &names = Outputs(name); + SetDims(names, dims); +} + +void InferShapeContext::ShareLoD(const std::string &in, const std::string &out, + size_t i, size_t j) const {} + +std::vector InferShapeContext::GetDims( + const std::vector &names) const { + std::vector ret; + ret.reserve(names.size()); + std::transform( + names.begin(), names.end(), std::back_inserter(ret), + [this](const std::string &name) { return this->GetDim(name); }); + return ret; +} + +void InferShapeContext::SetDims(const std::vector &names, + const std::vector &dims) { + size_t length = names.size(); + PADDLE_ENFORCE_EQ(length, dims.size()); + for (size_t i = 0; i < length; ++i) { + SetDim(names[i], dims[i]); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index b93f980cf6..f1f1e44bcc 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/framework/attribute.h" #include "paddle/framework/ddim.h" namespace paddle { @@ -21,7 +22,7 @@ namespace framework { class InferShapeContext { public: - virtual ~InferShapeContext() {} + virtual ~InferShapeContext() = default; virtual bool HasInput(const std::string &name) const = 0; virtual bool HasOutput(const std::string &name) const = 0; @@ -29,57 +30,32 @@ class InferShapeContext { virtual bool HasOutputs(const std::string &name) const = 0; virtual framework::DDim GetInputDim(const std::string &name) const = 0; - std::vector GetInputsDim(const std::string &name) const { - const std::vector &names = Inputs(name); - return GetDims(names); - } - virtual void SetInputDim(const std::string &name, - const framework::DDim &dim) = 0; - void SetInputsDim(const std::string &name, - const std::vector &dims) { - auto &names = Inputs(name); - SetDims(names, dims); - } - virtual framework::DDim GetOutputDim(const std::string &name) const = 0; - std::vector GetOutputsDim(const std::string &name) const { - const std::vector &names = Outputs(name); - return GetDims(names); - } + + std::vector GetInputsDim(const std::string &name) const; + virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; void SetOutputsDim(const std::string &name, - const std::vector &dims) { - auto &names = Outputs(name); - SetDims(names, dims); - } + const std::vector &dims); + virtual AttrReader Attrs() const = 0; virtual const std::vector &Inputs( const std::string &name) const = 0; virtual const std::vector &Outputs( const std::string &name) const = 0; + // TODO(qiao) implement this function void ShareLoD(const std::string &in, const std::string &out, size_t i = 0, - size_t j = 0) const {} + size_t j = 0) const; protected: virtual framework::DDim GetDim(const std::string &name) const = 0; virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; + std::vector GetDims( - const std::vector &names) const { - std::vector ret; - ret.reserve(names.size()); - std::transform( - names.begin(), names.end(), std::back_inserter(ret), - [this](const std::string &name) { return this->GetDim(name); }); - return ret; - } + const std::vector &names) const; + void SetDims(const std::vector &names, - const std::vector &dims) { - size_t length = names.size(); - PADDLE_ENFORCE_EQ(length, dims.size()); - for (size_t i = 0; i < length; ++i) { - SetDim(names[i], dims[i]); - } - } + const std::vector &dims); }; } // namespace framework From 3ecad8ae65df6050269f8faf6e000b2e13af4af2 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 28 Oct 2017 14:43:09 -0700 Subject: [PATCH 54/81] Enable xe unittest (#5180) --- python/paddle/v2/framework/tests/test_cross_entropy_op.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 8b94539dcd..6f28ce723a 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -92,5 +92,4 @@ class TestCrossEntropyOp3(OpTest): if __name__ == "__main__": - exit(0) # Gradient operator has bug! 
unittest.main() From 008f40ce09f0d06bade1ae596dff87a9ba352c4e Mon Sep 17 00:00:00 2001 From: QI JUN Date: Sat, 28 Oct 2017 15:01:44 -0700 Subject: [PATCH 55/81] support sparse output for lookup table grad op (#5145) * add sparse support for sum op * typo fix * fix gpu build error * fix unittest error * typo fix * infer var type and shape in op_test * follow comments * fix build error * bypass some unittests depend on NetOp * support sparse output for lookup table grad op * refine codes * fix gpu build error * fix lookup table grad gpu kernel * fix ci * fix ci * fix ci * fix bug in lookup_table_grad op * fix bug in test_word2vec * register double kernel for some operators * set is_sparse=True in test_word2vec * fix lookup table grad op CUDA kernel bug * disable test_modified_huber_loss_op temporarily * disable test_lstm_unit_op temporarily --- paddle/operators/cross_entropy_op.cu | 8 +- paddle/operators/cross_entropy_op.h | 14 +-- paddle/operators/feed_op.cc | 2 +- paddle/operators/lookup_table_op.cc | 44 +++++++- paddle/operators/lookup_table_op.cu | 100 ++++++++++++------ paddle/operators/lookup_table_op.h | 70 ++++++++---- paddle/operators/math/cross_entropy.cc | 2 +- paddle/operators/math/cross_entropy.cu | 4 +- paddle/operators/sgd_op.cc | 5 +- paddle/operators/sgd_op.cu | 5 +- paddle/operators/sum_op.h | 9 -- paddle/operators/uniform_random_op.cc | 3 +- paddle/operators/uniform_random_op.cu | 3 +- paddle/pybind/tensor_py.h | 3 +- python/paddle/v2/framework/layers.py | 4 +- .../framework/tests/test_cross_entropy_op.py | 2 +- .../paddle/v2/framework/tests/test_layers.py | 10 +- .../framework/tests/test_lookup_table_op.py | 2 +- .../v2/framework/tests/test_lstm_unit_op.py | 7 +- .../tests/test_modified_huber_loss_op.py | 2 + .../tests/test_recognize_digits_conv.py | 4 +- .../tests/test_recognize_digits_mlp.py | 4 +- .../v2/framework/tests/test_word2vec.py | 25 +++-- 23 files changed, 218 insertions(+), 114 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 5f8a6cd5ef..a523cb6fce 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -21,7 +21,7 @@ namespace { template __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X, - const int* label, const int N, + const int64_t* label, const int N, const int D) { // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file. 
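// CUDA_1D_KERNEL_LOOP is conventionally a grid-stride loop of the form
//   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
//        i += blockDim.x * gridDim.x)
// so a fixed-size launch covers all n elements; the commented-out lines
// below sketch how this kernel would use it once the macro exists.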
// CUDA_1D_KERNEL_LOOP(i, N) { @@ -77,8 +77,8 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { T* dx_data = dx->mutable_data(ctx.GetPlace()); const T* x_data = x->data(); - int batch_size = x->dims()[0]; - int class_num = x->dims()[1]; + int64_t batch_size = x->dims()[0]; + int64_t class_num = x->dims()[1]; int block = 512; int grid = (batch_size * class_num + block - 1) / block; @@ -93,7 +93,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { } else { math::SetConstant functor; functor(ctx.device_context(), dx, 0); - auto* label_data = label->data(); + auto* label_data = label->data(); grid = (batch_size + block - 1) / block; CrossEntropyGradientKernel<<< grid, block, 0, reinterpret_cast( diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 42f282103b..37db0a930a 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -54,7 +54,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { Tensor* dx = ctx.Output(framework::GradVarName("X")); T* dx_data = dx->mutable_data(ctx.GetPlace()); - int class_num = x->dims()[1]; + int64_t class_num = x->dims()[1]; if (ctx.Attr("soft_label")) { auto x_mat = EigenMatrix::From(*x); auto dy_mat = EigenMatrix::From(*dy); @@ -62,20 +62,20 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { auto dx_mat = EigenMatrix::From(*dx); dx_mat.device(ctx.GetEigenDevice()) = - -(lbl_mat * dy_mat.broadcast(Eigen::DSizes(1, class_num)) / - x_mat); + -(lbl_mat * + dy_mat.broadcast(Eigen::DSizes(1, class_num)) / x_mat); } else { - int batch_size = x->dims()[0]; + int64_t batch_size = x->dims()[0]; const T* dy_data = dy->data(); const T* x_data = x->data(); - const int* label_data = label->data(); + const int64_t* label_data = label->data(); math::SetConstant functor; functor(ctx.device_context(), dx, 0); - for (int i = 0; i < batch_size; ++i) { + for (int64_t i = 0; i < batch_size; ++i) { PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); - int index = i * class_num + label_data[i]; + int64_t index = i * class_num + label_data[i]; dx_data[index] = -dy_data[i] / x_data[index]; } } diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index 0f1722a538..0e5b263eae 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -41,7 +41,7 @@ class FeedOp : public framework::OperatorBase { auto col = Attr("col"); - VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var" + VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var " << out_name; auto &feed_list = feed_var->Get(); diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index ad86a2e5bc..8fdd42352e 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/operators/lookup_table_op.h" +#include "paddle/framework/var_type_inference.h" namespace paddle { namespace operators { @@ -60,6 +61,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "Ids must be a column vector with rank = 2." "The 2nd dimension size must be 1"); AddOutput("Out", "The lookup results, which have the same type with W."); + AddAttr("is_sparse", "Sparse update").SetDefault(false); AddComment(R"DOC( This operator is used to perform lookups on the parameter W, then concatenated into a dense tensor. @@ -70,6 +72,15 @@ or not. 
And the output only shares the LoD with input `Ids`. } }; +class LookupTableOpGradDescMaker + : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { return "lookup_table_grad"; } +}; + class LookupTableOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -86,12 +97,35 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { } }; +class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDescBind& op_desc, + framework::BlockDescBind* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; + block->Var(out_var_name)->SetType(framework::VarDesc::SELECTED_ROWS); + } else { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::VarDesc::LOD_TENSOR); + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker, - lookup_table_grad, ops::LookupTableOpGrad); - -REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel); -REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel); +REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, + ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker); +REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, + ops::LookupTableOpGradVarTypeInference); + +REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, + ops::LookupTableKernel); +REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, + ops::LookupTableGradKernel); diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index c3808fa9a8..837b2a1f4c 100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -14,22 +11,21 @@ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/lookup_table_op.h" #include "paddle/platform/assert.h" #include "paddle/platform/cuda_helper.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; - template -__global__ void LookupTable(T* output, const T* table, const int32_t* ids, - const int N, const int K, const int D) { +__global__ void LookupTable(T* output, const T* table, const int64_t* ids, + const int64_t N, const int64_t K, const int64_t D) { int idx = threadIdx.x; int idy = blockIdx.x + threadIdx.y * GridDimX; while (idy < K) { - int id = ids[idy]; + int64_t id = ids[idy]; PADDLE_ASSERT(id >= 0); PADDLE_ASSERT(id < N); T* out = output + idy * D; @@ -42,8 +38,9 @@ __global__ void LookupTable(T* output, const T* table, const int32_t* ids, } template -__global__ void LookupTableGrad(T* table, const T* output, const int32_t* ids, - const int N, const int K, const int D) { +__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, + const int64_t N, const int64_t K, + const int64_t D) { int idx = threadIdx.x; int idy = blockIdx.x + threadIdx.y * GridDimX; @@ -71,7 +68,7 @@ class LookupTableCUDAKernel : public framework::OpKernel { size_t N = table_t->dims()[0]; size_t D = table_t->dims()[1]; size_t K = ids_t->numel(); - auto ids = ids_t->data(); + auto ids = ids_t->data(); auto table = table_t->data(); auto output = output_t->mutable_data(context.GetPlace()); @@ -88,27 +85,63 @@ template class LookupTableGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto ids_t = context.Input("Ids"); - auto d_output_t = context.Input(framework::GradVarName("Out")); - auto d_table_t = context.Output(framework::GradVarName("W")); - - int N = d_table_t->dims()[0]; - int D = d_table_t->dims()[1]; - int K = ids_t->numel(); - const int32_t* ids = ids_t->data(); - const T* d_output = d_output_t->data(); - T* d_table = d_table_t->mutable_data(context.GetPlace()); - - auto t = framework::EigenVector::Flatten(*d_table_t); - t.device(context.GetEigenDevice()) = - t.constant(static_cast(0)); - - dim3 threads(128, 8); - dim3 grids(8, 1); - LookupTableGrad<<< - grids, threads, 0, reinterpret_cast( + bool is_sparse = context.Attr("is_sparse"); + if (is_sparse) { + auto* ids = context.Input("Ids"); + auto* table = context.Input("W"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); + + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); + + auto stream = reinterpret_cast( + context.device_context()) + .stream(); + // copy GPU memory to CPU pinned memory + framework::Vector new_rows; + new_rows.resize(ids_dim[0]); + auto gpu_place = boost::get(context.GetPlace()); + + memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data, + ids_dim[0] * sizeof(int64_t), stream); + + d_table->set_rows(new_rows); + + auto* d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_dim[0], table->dims()[1]}); + d_table_value->mutable_data(context.GetPlace()); + + auto* d_table_data = d_table_value->data(); + auto* d_output_data = d_output->data(); + PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data, + d_output->numel(), stream); + + } else { + auto ids_t = context.Input("Ids"); + auto d_output_t = context.Input(framework::GradVarName("Out")); + 
auto d_table_t = context.Output(framework::GradVarName("W")); + + int N = d_table_t->dims()[0]; + int D = d_table_t->dims()[1]; + int K = ids_t->numel(); + const int64_t* ids = ids_t->data(); + const T* d_output = d_output_t->data(); + T* d_table = d_table_t->mutable_data(context.GetPlace()); + + auto t = framework::EigenVector::Flatten(*d_table_t); + t.device(context.GetEigenDevice()) = + t.constant(static_cast(0)); + + dim3 threads(128, 8); + dim3 grids(8, 1); + LookupTableGrad<<( context.device_context()) .stream()>>>(d_table, d_output, ids, N, K, D); + } } }; @@ -116,6 +149,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel); -REGISTER_OP_GPU_KERNEL(lookup_table_grad, - ops::LookupTableGradCUDAKernel); +REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); +REGISTER_OP_GPU_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel, + ops::LookupTableGradCUDAKernel); diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h index dfead2fc5b..54067cd01d 100644 --- a/paddle/operators/lookup_table_op.h +++ b/paddle/operators/lookup_table_op.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,12 +12,15 @@ #pragma once #include "paddle/framework/eigen.h" +#include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" +#include "paddle/framework/selected_rows.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using SelectedRows = framework::SelectedRows; template class LookupTableKernel : public framework::OpKernel { @@ -32,7 +32,7 @@ class LookupTableKernel : public framework::OpKernel { int N = table_t->dims()[0]; int D = table_t->dims()[1]; - auto ids = ids_t->data(); + auto ids = ids_t->data(); auto table = table_t->data(); auto output = output_t->mutable_data(context.GetPlace()); for (int64_t i = 0; i < ids_t->numel(); ++i) { @@ -47,25 +47,55 @@ template class LookupTableGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto ids_t = context.Input("Ids"); - auto d_output_t = context.Input(framework::GradVarName("Out")); - auto d_table_t = context.Output(framework::GradVarName("W")); + bool is_sparse = context.Attr("is_sparse"); + if (is_sparse) { + auto* ids = context.Input("Ids"); + auto* table = context.Input("W"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); - int N = d_table_t->dims()[0]; - int D = d_table_t->dims()[1]; - auto ids = ids_t->data(); - const T* d_output = d_output_t->data(); - T* d_table = d_table_t->mutable_data(context.GetPlace()); + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); - auto t = framework::EigenVector::Flatten(*d_table_t); - t.device(context.GetEigenDevice()) = - t.constant(static_cast(0)); + framework::Vector new_rows; + new_rows.reserve(ids_dim[0]); + for (int64_t i = 0; i < 
ids_dim[0]; i++) { + new_rows.push_back(ids_data[i]); + } + d_table->set_rows(new_rows); - for (int64_t i = 0; i < ids_t->numel(); ++i) { - PADDLE_ENFORCE_LT(ids[i], N); - PADDLE_ENFORCE_GE(ids[i], 0); - for (int j = 0; j < D; ++j) { - d_table[ids[i] * D + j] += d_output[i * D + j]; + auto* d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_dim[0], table->dims()[1]}); + d_table_value->mutable_data(context.GetPlace()); + + d_table->set_height(table->dims()[0]); + + auto* d_output_data = d_output->data(); + auto* d_table_data = d_table_value->data(); + + PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); + } else { + auto* ids = context.Input("Ids"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); + auto* table = context.Input("W"); + + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); + + int N = table->dims()[0]; + int D = d_output->dims()[1]; + + auto* d_output_data = d_output->data(); + auto* d_table_data = d_table->mutable_data(context.GetPlace()); + + for (int64_t i = 0; i < ids->numel(); ++i) { + PADDLE_ENFORCE_LT(ids_data[i], N); + PADDLE_ENFORCE_GE(ids_data[i], 0); + for (int j = 0; j < D; ++j) { + d_table_data[ids_data[i] * D + j] = d_output_data[i * D + j]; + } } } } diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/operators/math/cross_entropy.cc index cb28add3f0..cf238a58e0 100644 --- a/paddle/operators/math/cross_entropy.cc +++ b/paddle/operators/math/cross_entropy.cc @@ -44,7 +44,7 @@ class CrossEntropyFunctor { const T* prob_data = prob->data(); T* loss_data = out->data(); - const int* label_data = labels->data(); + const int64_t* label_data = labels->data(); for (int i = 0; i < batch_size; ++i) { int index = i * class_num + label_data[i]; loss_data[i] = -math::TolerableValue()(std::log(prob_data[index])); diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu index 80db130aa0..651c08f740 100644 --- a/paddle/operators/math/cross_entropy.cu +++ b/paddle/operators/math/cross_entropy.cu @@ -20,7 +20,7 @@ namespace math { namespace { template -__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, +__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, const int N, const int D) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { @@ -115,7 +115,7 @@ class CrossEntropyFunctor { reinterpret_cast(ctx).stream()>>>( loss_data, prob_data, label_data, class_num); } else { - const int* label_data = labels->data(); + const int64_t* label_data = labels->data(); int block = 512; int grid = (batch_size + block - 1) / block; CrossEntropyKernel<<< diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 2acb96d1b4..939176c73d 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -89,11 +89,12 @@ struct SparseSGDFunctor { }; template struct SparseSGDFunctor; +template struct SparseSGDFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); -REGISTER_OP_CPU_KERNEL(sgd, - ops::SGDOpKernel); +REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, + ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 106f9b746b..2f41c7fc12 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -71,10 +71,11 @@ struct 
SparseSGDFunctor { }; template struct SparseSGDFunctor; +template struct SparseSGDFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sgd, - ops::SGDOpKernel); +REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel, + ops::SGDOpKernel); diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index a4be6b61b9..f2f2c67bc3 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -35,13 +35,6 @@ class SumKernel : public framework::OpKernel { if (out_var->IsType()) { auto* out = context.Output("Out"); - // Runtime InferShape - for (int i = 0; i < N; i++) { - if (in_vars[i]->IsType()) { - out->Resize(in_vars[i]->Get().dims()); - break; - } - } out->mutable_data(context.GetPlace()); auto result = EigenVector::Flatten(*out); @@ -73,12 +66,10 @@ class SumKernel : public framework::OpKernel { first_dim += in_vars[i]->Get().rows().size(); } auto in_dim = in_vars[0]->Get().value().dims(); - auto in_dim_vec = framework::vectorize(in_dim); in_dim_vec[0] = static_cast(first_dim); out_value->Resize(framework::make_ddim(in_dim_vec)); - out_value->mutable_data(context.GetPlace()); math::SelectedRowsAddTo functor; diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 39b53948e3..82f9b8fbf1 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -95,4 +95,5 @@ Used to initialize tensor with uniform random generator. REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp, paddle::operators::UniformRandomOpMaker); REGISTER_OP_CPU_KERNEL(uniform_random, - paddle::operators::CPUUniformRandomKernel); + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel); diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu index 5612ce9eb1..8b20bb8287 100644 --- a/paddle/operators/uniform_random_op.cu +++ b/paddle/operators/uniform_random_op.cu @@ -64,4 +64,5 @@ class GPUUniformRandomKernel : public framework::OpKernel { } // namespace paddle REGISTER_OP_GPU_KERNEL(uniform_random, - paddle::operators::GPUUniformRandomKernel); + paddle::operators::GPUUniformRandomKernel, + paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 85f9f22733..f278e79af6 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -85,7 +85,8 @@ struct CastToPyBufferImpl { } // namespace details inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { auto buffer_info = - details::CastToPyBufferImpl()(tensor); + details::CastToPyBufferImpl()( + tensor); return buffer_info; } diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py index 4bb763e6d9..7c87bfaece 100644 --- a/python/paddle/v2/framework/layers.py +++ b/python/paddle/v2/framework/layers.py @@ -61,6 +61,7 @@ def fc(input, def embedding(input, size, data_type='float32', + is_sparse=False, param_attr=None, program=None, init_program=None): @@ -72,7 +73,8 @@ def embedding(input, type='lookup_table', inputs={'Ids': input, 'W': w}, - outputs={'Out': tmp}) + outputs={'Out': tmp}, + attrs={'is_sparse': is_sparse}) return tmp diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 6f28ce723a..b81af9364d 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -14,7 
+14,7 @@ class TestCrossEntropyOp1(OpTest): X = randomize_probability(batch_size, class_num, dtype='float64') - label = np.random.randint(0, class_num, (batch_size, 1), dtype="int32") + label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64") cross_entropy = np.asmatrix( [[-np.log(X[i][label[i][0]])] for i in range(X.shape[0])], dtype="float64") diff --git a/python/paddle/v2/framework/tests/test_layers.py b/python/paddle/v2/framework/tests/test_layers.py index 54f8a0270d..5cbe790e3f 100644 --- a/python/paddle/v2/framework/tests/test_layers.py +++ b/python/paddle/v2/framework/tests/test_layers.py @@ -93,15 +93,15 @@ class TestBook(unittest.TestCase): dict_size = 10000 embed_size = 32 first_word = layers.data( - name='firstw', shape=[1], data_type='int32', program=program) + name='firstw', shape=[1], data_type='int64', program=program) second_word = layers.data( - name='secondw', shape=[1], data_type='int32', program=program) + name='secondw', shape=[1], data_type='int64', program=program) third_word = layers.data( - name='thirdw', shape=[1], data_type='int32', program=program) + name='thirdw', shape=[1], data_type='int64', program=program) forth_word = layers.data( - name='forthw', shape=[1], data_type='int32', program=program) + name='forthw', shape=[1], data_type='int64', program=program) next_word = layers.data( - name='nextw', shape=[1], data_type='int32', program=program) + name='nextw', shape=[1], data_type='int64', program=program) embed_first = layers.embedding( input=first_word, diff --git a/python/paddle/v2/framework/tests/test_lookup_table_op.py b/python/paddle/v2/framework/tests/test_lookup_table_op.py index 2c48f9bf93..a56a549e69 100644 --- a/python/paddle/v2/framework/tests/test_lookup_table_op.py +++ b/python/paddle/v2/framework/tests/test_lookup_table_op.py @@ -7,7 +7,7 @@ class TestLookupTableOp(OpTest): def setUp(self): self.op_type = "lookup_table" table = np.random.random((17, 31)).astype("float32") - ids = np.random.randint(0, 17, 4).astype("int32") + ids = np.random.randint(0, 17, 4).astype("int64") ids_expand = np.expand_dims(ids, axis=1) self.inputs = {'W': table, 'Ids': ids_expand} self.outputs = {'Out': table[ids]} diff --git a/python/paddle/v2/framework/tests/test_lstm_unit_op.py b/python/paddle/v2/framework/tests/test_lstm_unit_op.py index cf0e25f5eb..6bad2e1f7c 100644 --- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py +++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py @@ -34,6 +34,7 @@ class LstmUnitTest(OpTest): self.check_grad(['X', 'C_prev'], ['C', 'H']) -# TODO(gongwb):fix CI error -#if __name__ == "__main__": -# unittest.main() +if __name__ == "__main__": + # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 + exit(0) + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py b/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py index bc8ee369d2..33de8ff721 100644 --- a/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py +++ b/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py @@ -45,4 +45,6 @@ class TestModifiedHuberLossOp(OpTest): if __name__ == '__main__': + exit(0) + # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 unittest.main() diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py index 2b305213df..a9b6c8410e 100644 --- a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py +++ 
b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py @@ -21,7 +21,7 @@ images = layers.data( label = layers.data( name='label', shape=[1], - data_type='int32', + data_type='int64', program=program, init_program=init_program) conv_pool_1 = nets.simple_img_conv_pool( @@ -72,7 +72,7 @@ for pass_id in range(PASS_NUM): for data in train_reader(): img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]), data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([BATCH_SIZE, 1]) tensor_img = core.LoDTensor() diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py index 44a768d5e2..a8a34b2a95 100644 --- a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py +++ b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py @@ -52,7 +52,7 @@ predict = layers.fc(input=hidden2, label = layers.data( name='y', shape=[1], - data_type='int32', + data_type='int64', program=program, init_program=init_program) @@ -77,7 +77,7 @@ PASS_NUM = 100 for pass_id in range(PASS_NUM): for data in train_reader(): x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = np.expand_dims(y_data, axis=1) tensor_x = core.LoDTensor() diff --git a/python/paddle/v2/framework/tests/test_word2vec.py b/python/paddle/v2/framework/tests/test_word2vec.py index f5e61bef0d..515d30d3e2 100644 --- a/python/paddle/v2/framework/tests/test_word2vec.py +++ b/python/paddle/v2/framework/tests/test_word2vec.py @@ -15,6 +15,7 @@ embed_size = 32 hidden_size = 256 N = 5 batch_size = 32 +is_sparse = True word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) @@ -22,31 +23,31 @@ dict_size = len(word_dict) first_word = layers.data( name='firstw', shape=[1], - data_type='int32', + data_type='int64', program=program, init_program=init_program) second_word = layers.data( name='secondw', shape=[1], - data_type='int32', + data_type='int64', program=program, init_program=init_program) third_word = layers.data( name='thirdw', shape=[1], - data_type='int32', + data_type='int64', program=program, init_program=init_program) forth_word = layers.data( name='forthw', shape=[1], - data_type='int32', + data_type='int64', program=program, init_program=init_program) next_word = layers.data( name='nextw', shape=[1], - data_type='int32', + data_type='int64', program=program, init_program=init_program) @@ -54,6 +55,7 @@ embed_first = layers.embedding( input=first_word, size=[dict_size, embed_size], data_type='float32', + is_sparse=is_sparse, param_attr={'name': 'shared_w'}, program=program, init_program=init_program) @@ -61,6 +63,7 @@ embed_second = layers.embedding( input=second_word, size=[dict_size, embed_size], data_type='float32', + is_sparse=is_sparse, param_attr={'name': 'shared_w'}, program=program, init_program=init_program) @@ -69,6 +72,7 @@ embed_third = layers.embedding( input=third_word, size=[dict_size, embed_size], data_type='float32', + is_sparse=is_sparse, param_attr={'name': 'shared_w'}, program=program, init_program=init_program) @@ -76,6 +80,7 @@ embed_forth = layers.embedding( input=forth_word, size=[dict_size, embed_size], data_type='float32', + is_sparse=is_sparse, param_attr={'name': 'shared_w'}, program=program, init_program=init_program) @@ -117,26 +122,26 @@ 
PASS_NUM = 100 for pass_id in range(PASS_NUM): for data in train_reader(): input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)] - input_data = map(lambda x: np.array(x).astype("int32"), input_data) + input_data = map(lambda x: np.array(x).astype("int64"), input_data) input_data = map(lambda x: np.expand_dims(x, axis=1), input_data) first_data = input_data[0] first_tensor = core.LoDTensor() first_tensor.set(first_data, place) - second_data = input_data[0] + second_data = input_data[1] second_tensor = core.LoDTensor() second_tensor.set(second_data, place) - third_data = input_data[0] + third_data = input_data[2] third_tensor = core.LoDTensor() third_tensor.set(third_data, place) - forth_data = input_data[0] + forth_data = input_data[3] forth_tensor = core.LoDTensor() forth_tensor.set(forth_data, place) - next_data = input_data[0] + next_data = input_data[4] next_tensor = core.LoDTensor() next_tensor.set(next_data, place) From 46a13e37d7fc670d76f3dd034efabaab5a4601a0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 28 Oct 2017 15:57:19 -0700 Subject: [PATCH 56/81] Polish Accuracy Op (#5191) * Accuracy does not support float/double, only support integers * Polish error message when an operator does not support some device. --- paddle/framework/operator.cc | 5 +++-- paddle/operators/accuracy_op.cc | 4 +--- paddle/operators/accuracy_op.cu | 4 +--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 9e1e955aae..222a252dc4 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -390,7 +390,8 @@ void OperatorWithKernel::Run(const Scope& scope, auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW("op[%s] has no kernel", type_); + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", type_); } // check if op[type] have kernel for kernel_key @@ -399,7 +400,7 @@ void OperatorWithKernel::Run(const Scope& scope, auto kernel_iter = kernels.find(kernel_key); if (kernel_iter == kernels.end()) { - PADDLE_THROW("op[%s] has no kernel with kernel_key[%s]", type_, kernel_key); + PADDLE_THROW("The operator %s does not support %s", type_, kernel_key); } kernel_iter->second->Compute(ctx); diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index e0a00ecaf0..eb8bce8da7 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -70,7 +70,5 @@ information, or not. But the output only shares the LoD with input `Inference`. 
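For context on the accuracy change above: the op now registers kernels only for integer types, because its Inference input holds predicted class indices rather than scores. Below is a rough NumPy sketch of the top-k accuracy it computes (illustrative only, not the kernel's actual code; the helper name topk_accuracy is made up):

import numpy as np

def topk_accuracy(inference, label):
    # inference: int64 indices, shape [batch_size, topk];
    # label: int64 ground truth, shape [batch_size, 1].
    # A sample counts as correct if any of its top-k indices hits the label.
    hits = (inference == label).any(axis=1)
    return hits.mean()

pred = np.array([[0, 2], [1, 0], [4, 4]], dtype=np.int64)
label = np.array([[2], [0], [3]], dtype=np.int64)
print(topk_accuracy(pred, label))  # 2 of 3 samples hit -> ~0.6667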
namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker); REGISTER_OP_CPU_KERNEL( - accuracy, ops::AccuracyKernel, - ops::AccuracyKernel, - ops::AccuracyKernel, + accuracy, ops::AccuracyKernel, ops::AccuracyKernel); diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index 54e6ab99dc..be58dfbd03 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -81,7 +81,5 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel, +REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel, paddle::operators::AccuracyOpCUDAKernel); From b84e8226514b8bb4405c3c28e54aa5077193d179 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 28 Oct 2017 16:30:55 -0700 Subject: [PATCH 57/81] Cast Operator (#5149) * Cast Operator Cast input variable to other data type * Fix compile error * Add cast op * Follow comments --- paddle/framework/data_type.h | 20 +++++ paddle/framework/op_registry.h | 4 + paddle/operators/cast_op.cc | 73 +++++++++++++++++++ paddle/operators/cast_op.cu | 22 ++++++ paddle/operators/cast_op.h | 64 ++++++++++++++++ python/paddle/v2/framework/layers.py | 14 +++- .../paddle/v2/framework/tests/test_cast_op.py | 26 +++++++ 7 files changed, 222 insertions(+), 1 deletion(-) create mode 100644 paddle/operators/cast_op.cc create mode 100644 paddle/operators/cast_op.cu create mode 100644 paddle/operators/cast_op.h create mode 100644 python/paddle/v2/framework/tests/test_cast_op.py diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h index bafb4fbd48..c5ae7b1854 100644 --- a/paddle/framework/data_type.h +++ b/paddle/framework/data_type.h @@ -34,5 +34,25 @@ inline DataType ToDataType(std::type_index type) { } } +template +inline void VisitDataType(DataType type, Visitor visitor) { + switch (type) { + case DataType::FP32: + visitor.template operator()(); + break; + case DataType::FP64: + visitor.template operator()(); + break; + case DataType::INT32: + visitor.template operator()(); + break; + case DataType::INT64: + visitor.template operator()(); + break; + default: + PADDLE_THROW("Not supported"); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index deacf41f99..2f461e7b2a 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -162,6 +162,10 @@ class OpKernelRegistrar : public Registrar { REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ op_maker_class); +#define REGISTER_OP_WITH_KERNEL(op_type, ...) \ + REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \ + ##__VA_ARGS__) + #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \ REGISTER_OPERATOR(op_type, op_class, op_maker_class) diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc new file mode 100644 index 0000000000..19187894c3 --- /dev/null +++ b/paddle/operators/cast_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/cast_op.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + CastOpProtoMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "the input tensor of cast op"); + AddOutput("Out", "the output tensor of cast op"); + AddComment(R"DOC(Cast operator. +cast the input tensor to other data type. +)DOC"); + AddAttr("out_data_type", "output data type"); + AddAttr("in_data_type", "input data type"); + } +}; + +class CastOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "The input of cast op must be set"); + PADDLE_ENFORCE(context->HasOutput("Out"), + "The output of cast op must be set"); + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class CastOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad = new framework::OpDescBind(); + grad->SetType("cast"); + grad->SetInput("X", OutputGrad("Out")); + grad->SetOutput("Out", InputGrad("X")); + grad->SetAttr("out_data_type", GetAttr("in_data_type")); + grad->SetAttr("in_data_type", GetAttr("out_data_type")); + return std::unique_ptr(grad); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUPlace; +REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, + ops::CastOpProtoMaker); +REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel); diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu new file mode 100644 index 0000000000..fb75ddbabf --- /dev/null +++ b/paddle/operators/cast_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/cast_op.h" + +template +using CastOpKernel = + paddle::operators::CastOpKernel; + +REGISTER_OP_GPU_KERNEL(cast, CastOpKernel, CastOpKernel, + CastOpKernel, CastOpKernel); diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h new file mode 100644 index 0000000000..ffdbff7030 --- /dev/null +++ b/paddle/operators/cast_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
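A note on CastOpGradMaker above: the backward pass of a cast is simply another cast with in_data_type and out_data_type swapped, so no dedicated gradient kernel is needed. A minimal NumPy analogy of that arrangement (illustrative only; NumPy dtypes stand in for the DataType attributes):

import numpy as np

x = np.random.rand(2, 3).astype(np.float32)
out = x.astype(np.float64)         # forward: cast fp32 -> fp64
d_out = np.ones_like(out)          # upstream gradient arrives as fp64
d_x = d_out.astype(np.float32)     # backward: the same cast, types swapped
assert d_x.dtype == x.dtype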
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/framework/data_type.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/op_registry.h" +#include "paddle/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +struct CastOpFunctor { + const framework::Tensor* in_; + framework::Tensor* out_; + const platform::DeviceContext& ctx_; + CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, + const platform::DeviceContext& ctx) + : in_(in), out_(out), ctx_(ctx) {} + + template + void operator()() const { + auto* in_begin = in_->data(); + auto numel = in_->numel(); + auto* in_end = in_begin + numel; + auto* out_begin = out_->mutable_data(ctx_.GetPlace()); + platform::Transform trans; + trans(ctx_, in_begin, in_end, out_begin, + CastOpTransformFunctor()); + } +}; + +template +class CastOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + framework::VisitDataType( + static_cast(context.Attr("out_data_type")), + CastOpFunctor(in, out, context.device_context())); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py index 7c87bfaece..9e6d5f49db 100644 --- a/python/paddle/v2/framework/layers.py +++ b/python/paddle/v2/framework/layers.py @@ -5,7 +5,7 @@ import re __all__ = [ 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', - 'StaticRNN' + 'StaticRNN', 'cast' ] @@ -163,6 +163,18 @@ _create_op_func_('mul') _create_op_func_('dropout') +def cast(x, data_type, program=None): + helper = LayerHelper('cast', **locals()) + out = helper.create_tmp_variable(dtype=data_type) + helper.append_op( + type='cast', + inputs={'X': [x]}, + outputs={'Out': [out]}, + attrs={'in_data_type': x.data_type, + 'out_data_type': out.data_type}) + return out + + def concat(input, axis, program=None, init_program=None): helper = LayerHelper('concat', **locals()) if not isinstance(input, list) and not isinstance(input, tuple): diff --git a/python/paddle/v2/framework/tests/test_cast_op.py b/python/paddle/v2/framework/tests/test_cast_op.py new file mode 100644 index 0000000000..52ee71a8a4 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_cast_op.py @@ -0,0 +1,26 @@ +import op_test +import unittest +import numpy as np +import paddle.v2.framework.core as core + + +class TestCastOp(op_test.OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]) + self.inputs = {'X': ipt.astype('float32')} + self.outputs = {'Out': ipt.astype('float64')} + self.attrs = { + 'in_data_type': int(core.DataType.FP32), + 'out_data_type': int(core.DataType.FP64) + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output() + + def test_grad(self): + 
self.check_grad(['X'], ['Out']) + + +if __name__ == '__main__': + unittest.main() From 71305e5f90f87dcdf6fc0ab619f41da1763e74c7 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Sun, 29 Oct 2017 13:50:34 -0700 Subject: [PATCH 58/81] "polish code based on comment" --- paddle/framework/operator.h | 4 ++-- paddle/operators/nccl_op.cc | 5 +++++ paddle/operators/nccl_op.cu | 5 ++--- paddle/operators/nccl_op_test.cu | 10 ++++------ 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 3236250366..a2544f1dcd 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -290,12 +290,12 @@ class ExecutionContext { return device_context_; } - //! Get variables vector with same input name. + //! Get actual name vector for this input. const std::vector& Inputs(const std::string& name) const { return op_.Inputs(name); } - //! Get variables vector with same output name. + //! Get actual name vector for this output. const std::vector& Outputs(const std::string& name) const { return op_.Outputs(name); } diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 3744d1b470..d39cb2fcf9 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -30,6 +30,11 @@ class NCCLInitOp : public framework::OperatorBase { "Can not find variable '%s' in the scope.", name); std::vector gpus = Attr>("gpus"); PADDLE_ENFORCE(!gpus.empty(), "Attr(gpus) should not be empty."); + + if (scope.FindVar(name) == nullptr) { + PADDLE_THROW("Output(Communicator) is needed for ncclInit operator."); + } + platform::Communicator *comm = scope.FindVar(name)->GetMutable(); comm->InitAll(gpus); diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu index f8b3b8a8ba..86dee8ee8e 100644 --- a/paddle/operators/nccl_op.cu +++ b/paddle/operators/nccl_op.cu @@ -9,7 +9,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU #include #include "paddle/framework/lod_tensor.h" @@ -60,7 +59,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { } else if (reduction == "ncclProd") { reduction_op_ = ncclProd; } else { - PADDLE_ENFORCE(false, "Invalid reduction. default ncclSum."); + PADDLE_THROW("Invalid reduction. default ncclSum."); } auto* comm = ctx.Input("Communicator"); @@ -113,7 +112,7 @@ class NCCLReduceKernel : public framework::OpKernel { } else if (reduction == "ncclProd") { reduction_op_ = ncclProd; } else { - PADDLE_ENFORCE(false, "Invalid reduction. default ncclSum."); + PADDLE_THROW("Invalid reduction. default ncclSum."); } int root = ctx.Attr("root"); diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu index 63a286f602..80c50a28a9 100644 --- a/paddle/operators/nccl_op_test.cu +++ b/paddle/operators/nccl_op_test.cu @@ -12,8 +12,6 @@ See the License for the specific language governing permissions and limitations under the License. 
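The reduction handling above maps an attribute string onto an NCCL reduction and now fails fast via PADDLE_THROW for unknown names. A toy single-process model of that dispatch plus an allreduce, in NumPy (illustrative only; real NCCL reduces across one participant per GPU):

import numpy as np

REDUCTIONS = {
    "ncclSum": lambda xs: xs.sum(axis=0),
    "ncclMin": lambda xs: xs.min(axis=0),
    "ncclMax": lambda xs: xs.max(axis=0),
    "ncclProd": lambda xs: xs.prod(axis=0),
}

def fake_all_reduce(per_gpu, reduction="ncclSum"):
    if reduction not in REDUCTIONS:  # mirrors the PADDLE_THROW branch
        raise ValueError("Invalid reduction: %s" % reduction)
    reduced = REDUCTIONS[reduction](np.stack(per_gpu))
    return [reduced.copy() for _ in per_gpu]  # every rank gets the result

outs = fake_all_reduce([np.ones(4), 2 * np.ones(4)])
print(outs[0])  # [3. 3. 3. 3.]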
*/ -#define EIGEN_USE_GPU - #include #include #include @@ -193,7 +191,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { } } -// ncclAReduceOp with desc +// ncclReduceOp with desc TEST_F(NCCLTester, ncclReduceOp) { std::unique_ptr op2(new f::OpDescBind); const int kRoot = 0; @@ -201,7 +199,7 @@ TEST_F(NCCLTester, ncclReduceOp) { op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); op2->SetOutput("Out", {"rt"}); - op2->SetAttr("root", {kRoot}); + op2->SetAttr("root", kRoot); std::vector dev_scopes; @@ -241,7 +239,7 @@ TEST_F(NCCLTester, ncclReduceOp) { } } -// // ncclBcastOp with desc +// ncclBcastOp with desc TEST_F(NCCLTester, ncclBcastOp) { std::unique_ptr op2(new f::OpDescBind); const int kRoot = 5; @@ -249,7 +247,7 @@ TEST_F(NCCLTester, ncclBcastOp) { op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); op2->SetOutput("Out", {"rt"}); - op2->SetAttr("root", {kRoot}); + op2->SetAttr("root", kRoot); std::vector dev_scopes; From 0049ce047961fafc284a3692c1895028fe758ec2 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 30 Oct 2017 05:58:06 +0800 Subject: [PATCH 59/81] 03 image classification (#5192) * add batch_norm_layer * add img_conv_group layer and test * add check to Tensor.type() * forward can run * with backward * change label data time from int32 to int64 * refine code * follow comment --- paddle/framework/operator.h | 1 + paddle/framework/tensor.h | 9 +- paddle/operators/batch_norm_op.cc | 30 +++- paddle/operators/reshape_op.cc | 12 +- paddle/operators/reshape_op.h | 7 +- python/paddle/v2/framework/framework.py | 5 +- python/paddle/v2/framework/layers.py | 91 ++++++++++++ python/paddle/v2/framework/nets.py | 71 +++++++++- .../tests/test_image_classification_layer.py | 75 ++++++++++ .../tests/test_image_classification_train.py | 133 ++++++++++++++++++ 10 files changed, 418 insertions(+), 16 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_image_classification_layer.py create mode 100644 python/paddle/v2/framework/tests/test_image_classification_train.py diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 1294e06fb1..93885fa302 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -408,6 +408,7 @@ class OperatorWithKernel : public OperatorBase { // indicate kernel DataType by input data. Defaultly all input data must be // same. 
virtual DataType IndicateDataType(const ExecutionContext& ctx) const { + VLOG(3) << "Default IndicateDataType " << this->Type(); auto& scope = ctx.scope(); int data_type = -1; for (auto& input : this->inputs_) { diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 9d2dc6a32b..7b9a5b75e1 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -126,11 +126,16 @@ class Tensor { inline Tensor Slice(const int& begin_idx, const int& end_idx) const; platform::Place place() const { - PADDLE_ENFORCE_NOT_NULL(holder_, "Tensor get place() must contains holder"); + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor not initialized yet when Tensor::place() is called."); return holder_->place(); } - std::type_index type() const { return holder_->type(); } + std::type_index type() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor not initialized yet when Tensor::type() is called."); + return holder_->type(); + } size_t memory_size() const; diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index f7dc990f0d..f2c8be4c54 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -18,6 +18,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template using EigenMatrix = framework::EigenMatrix; @@ -64,6 +65,9 @@ class BatchNormOp : public framework::OperatorWithKernel { (tensor_format == TensorFormat::NCHW ? x_dims[1] : x_dims[x_dims.size() - 1]); + PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, + "Input x must have 3 to 5 dimensions."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); @@ -108,10 +112,12 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { "Store the global Variance when training"); AddOutput("SavedMean", "Mean of the current mini batch, " - "will apply to output when training"); + "will apply to output when training") + .AsIntermediate(); AddOutput("SavedVariance", "Variance of the current mini batch, " - "will apply to output when training"); + "will apply to output when training") + .AsIntermediate(); AddComment(R"DOC( https://arxiv.org/pdf/1502.03167.pdf @@ -135,7 +141,6 @@ class BatchNormKernel : public framework::OpKernel { const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, "The Input dim size should be between 3 and 5"); const int N = x_dims[0]; @@ -289,6 +294,25 @@ class BatchNormGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); } + + framework::DataType IndicateDataType( + const framework::ExecutionContext &ctx) const override { + VLOG(3) << "IndicateDataType " << this->Type(); + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + return framework::ToDataType(t->type()); + } }; template diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index a8eb8d45ee..eda8226480 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -34,13 +34,19 @@ class ReshapeOp : public 
framework::OperatorWithKernel { auto shape = ctx->Attrs().Get>("shape"); PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty."); - for (auto dim : shape) { - PADDLE_ENFORCE(dim > 0, "Each dimension of shape must be positive."); + auto x_dims = ctx->GetInputDim("X"); + // TODO(qiao) change batch_size + for (int i = 1; i < shape.size(); ++i) { + PADDLE_ENFORCE(shape[i] > 0, + "Each dimension of shape " + "must be positive except the first."); + } + if (shape[0] < 0) { + shape[0] = x_dims[0]; } // capacity check int64_t capacity = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - auto x_dims = ctx->GetInputDim("X"); int64_t in_size = framework::product(x_dims); PADDLE_ENFORCE_EQ(capacity, in_size, "The size of Input(X) mismatches with Attr(shape)."); diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index c89cdf8cab..beb951713a 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -26,13 +26,8 @@ class ReshapeKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const { auto* out = ctx.Output("Out"); auto* in = ctx.Input("X"); + auto out_dims = out->dims(); out->mutable_data(ctx.GetPlace()); - - auto shape = ctx.Attr>("shape"); - std::vector shape_int64(shape.size(), 0); - std::transform(shape.begin(), shape.end(), shape_int64.begin(), - [](int a) { return static_cast(a); }); - auto out_dims = framework::make_ddim(shape_int64); out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context()); out->Resize(out_dims); } diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py index 348c393913..43101c9dda 100644 --- a/python/paddle/v2/framework/framework.py +++ b/python/paddle/v2/framework/framework.py @@ -352,7 +352,10 @@ class Block(object): return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)} def create_var(self, *args, **kwargs): - return Variable(self, *args, **kwargs) + var = Variable(self, *args, **kwargs) + if 'init_attr' in kwargs: + self._prepend_initialize_ops_(var, kwargs['init_attr']) + return var def has_var(self, name): return name in self.vars diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py index 9e6d5f49db..041a3b2c0b 100644 --- a/python/paddle/v2/framework/layers.py +++ b/python/paddle/v2/framework/layers.py @@ -161,6 +161,7 @@ def _create_op_func_(op_type): _create_op_func_('mean') _create_op_func_('mul') _create_op_func_('dropout') +_create_op_func_('reshape') def cast(x, data_type, program=None): @@ -308,6 +309,96 @@ def pool2d(input, return pool_out +def batch_norm(input, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + data_layout='NCHW', + program=None, + init_program=None): + helper = LayerHelper('batch_norm', **locals()) + dtype = helper.input_dtype() + + input_shape = input.shape + if data_layout == 'NCHW': + channel_num = input_shape[1] + else: + if data_layout == 'NHWC': + channel_num = input_shape[-1] + else: + raise ValueError("unsupported data layout: " + data_layout) + + def get_init_attr(value): + if not isinstance(value, float): + raise ValueError("attr value should be a float") + return {'type': 'fill_constant', 'value': value} + + def prepend_init_op(var, init_attr): + assert isinstance(var, Variable) + op_type = init_attr['type'] + init_attr['shape'] = var.shape + init_attr['data_type'] = int(var.data_type) + op = var.block.prepend_op( + type=op_type, inputs=None, outputs={'Out': [var]},
attrs=init_attr) + return op + + def create_persistable_var(dtype, shape, init_attr=None): + name = unique_name(".".join([helper.name, "xxxx"])) + var = init_program.global_block().create_var( + dtype=dtype, shape=shape, name=name, persistable=True) + if init_attr is not None: + prepend_init_op(var, init_attr) + return program.global_block().create_var( + name=name, dtype=dtype, shape=shape, persistable=True) + + param_shape = [channel_num] + + # create parameter + scale = helper.create_parameter( + attr=helper.param_attr, shape=param_shape, dtype=dtype) + bias = helper.create_parameter( + attr=helper.param_attr, shape=param_shape, dtype=dtype) + + # create input + mean = create_persistable_var(dtype, param_shape, get_init_attr(0.0)) + variance = create_persistable_var(dtype, param_shape, get_init_attr(1.0)) + + # create output + # mean and mean_out share the same memory + mean_out = mean + # variance and variance_out share the same memory + variance_out = variance + saved_mean = helper.create_tmp_variable(dtype) + saved_variance = helper.create_tmp_variable(dtype) + + batch_norm_out = helper.create_tmp_variable(dtype) + + helper.append_op( + type="batch_norm", + inputs={ + "X": input, + "Scale": scale, + "Bias": bias, + "Mean": mean, + "Variance": variance + }, + outputs={ + "Y": batch_norm_out, + "MeanOut": mean_out, + "VarianceOut": variance_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance + }, + attrs={"momentum": momentum, + "epsilon": epsilon, + "is_test": is_test}) + + return helper.append_activation(batch_norm_out) + + class BlockGuard(object): """ BlockGuard used to create sub-block in program by using Python `with` diff --git a/python/paddle/v2/framework/nets.py b/python/paddle/v2/framework/nets.py index 8a83ebfb96..803534fa39 100644 --- a/python/paddle/v2/framework/nets.py +++ b/python/paddle/v2/framework/nets.py @@ -7,6 +7,7 @@ def simple_img_conv_pool(input, pool_size, pool_stride, act, + pool_type='max', program=None, init_program=None): conv_out = layers.conv2d( @@ -20,7 +21,75 @@ def simple_img_conv_pool(input, pool_out = layers.pool2d( input=conv_out, pool_size=pool_size, - pool_type='max', + pool_type=pool_type, pool_stride=pool_stride, program=program, init_program=init_program) return pool_out + + +def img_conv_group(input, + conv_num_filter, + pool_size, + conv_padding=1, + conv_filter_size=3, + conv_act=None, + conv_with_batchnorm=False, + conv_batchnorm_drop_rate=None, + pool_stride=1, + pool_type=None, + program=None, + init_program=None): + """ + Image Convolution Group, used for vgg net.
+ """ + tmp = input + assert isinstance(conv_num_filter, list) or \ + isinstance(conv_num_filter, tuple) + + def __extend_list__(obj): + if not hasattr(obj, '__len__'): + return [obj] * len(conv_num_filter) + else: + return obj + + conv_padding = __extend_list__(conv_padding) + conv_filter_size = __extend_list__(conv_filter_size) + conv_with_batchnorm = __extend_list__(conv_with_batchnorm) + conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate) + + for i in xrange(len(conv_num_filter)): + local_conv_act = conv_act + if conv_with_batchnorm[i]: + local_conv_act = None + + tmp = layers.conv2d( + input=tmp, + num_filters=conv_num_filter[i], + filter_size=conv_filter_size[i], + padding=conv_padding[i], + act=local_conv_act, + program=program, + init_program=init_program) + + if conv_with_batchnorm[i]: + tmp = layers.batch_norm( + input=tmp, + act=conv_act, + program=program, + init_program=init_program) + drop_rate = conv_batchnorm_drop_rate[i] + if abs(drop_rate) > 1e-5: + tmp = layers.dropout( + x=tmp, + dropout_prob=drop_rate, + program=program, + init_program=init_program) + + pool_out = layers.pool2d( + input=tmp, + pool_size=pool_size, + pool_type=pool_type, pool_stride=pool_stride, program=program, init_program=init_program) diff --git a/python/paddle/v2/framework/tests/test_image_classification_layer.py b/python/paddle/v2/framework/tests/test_image_classification_layer.py new file mode 100644 index 0000000000..908cf44b88 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_image_classification_layer.py @@ -0,0 +1,75 @@ +import unittest + +import paddle.v2.framework.layers as layers +import paddle.v2.framework.nets as nets +from paddle.v2.framework.framework import Program + + +def conv_block(input, + num_filter, + groups, + dropouts, + program=None, + init_program=None): + return nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max', + program=program, + init_program=init_program) + + +class TestLayer(unittest.TestCase): + def test_batch_norm_layer(self): + program = Program() + init_program = Program() + images = layers.data( + name='pixel', + shape=[3, 48, 48], + data_type='float32', + program=program) + layers.batch_norm( + input=images, program=program, init_program=init_program) + + #print str(program) + + def test_dropout_layer(self): + program = Program() + init_program = Program() + images = layers.data( + name='pixel', + shape=[3, 48, 48], + data_type='float32', + program=program) + layers.dropout( + x=images, + dropout_prob=0.5, + program=program, + init_program=init_program) + + #print str(program) + + def test_img_conv_group(self): + program = Program() + init_program = Program() + + images = layers.data( + name='pixel', + shape=[3, 48, 48], + data_type='float32', + program=program, + init_program=init_program) + conv1 = conv_block(images, 64, 2, [0.3, 0], program, init_program) + conv2 = conv_block(conv1, 256, 3, [0.4, 0.4, 0], program, init_program) + + # print str(program) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_image_classification_train.py b/python/paddle/v2/framework/tests/test_image_classification_train.py new file mode 100644 index 0000000000..4eb9051261 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_image_classification_train.py @@ -0,0 +1,133 @@ +import paddle.v2 as paddle +import 
paddle.v2.framework.layers as layers +import paddle.v2.framework.nets as nets +import paddle.v2.framework.core as core +import paddle.v2.framework.optimizer as optimizer + +from paddle.v2.framework.framework import Program, g_program +from paddle.v2.framework.executor import Executor + +import numpy as np + + +def vgg16_bn_drop(input, program, init_program): + def conv_block(input, + num_filter, + groups, + dropouts, + program=None, + init_program=None): + return nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max', + program=program, + init_program=init_program) + + conv1 = conv_block(input, 64, 2, [0.3, 0], program, init_program) + conv2 = conv_block(conv1, 128, 2, [0.4, 0], program, init_program) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0], program, init_program) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0], program, init_program) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0], program, init_program) + + drop = layers.dropout( + x=conv5, dropout_prob=0.5, program=program, init_program=init_program) + fc1 = layers.fc(input=drop, + size=512, + act=None, + program=program, + init_program=init_program) + reshape1 = layers.reshape( + x=fc1, + shape=list(fc1.shape + (1, 1)), + program=program, + init_program=init_program) + bn = layers.batch_norm( + input=reshape1, act='relu', program=program, init_program=init_program) + drop2 = layers.dropout( + x=bn, dropout_prob=0.5, program=program, init_program=init_program) + fc2 = layers.fc(input=drop2, + size=512, + act=None, + program=program, + init_program=init_program) + return fc2 + + +init_program = Program() +program = Program() + +classdim = 10 +data_shape = [3, 32, 32] + +images = layers.data( + name='pixel', shape=data_shape, data_type='float32', program=program) + +label = layers.data( + name='label', + shape=[1], + data_type='int64', + program=program, + init_program=init_program) +vgg_net = vgg16_bn_drop(images, program, init_program) +predict = layers.fc(input=vgg_net, + size=classdim, + act='softmax', + program=program, + init_program=init_program) +cost = layers.cross_entropy( + input=predict, label=label, program=program, init_program=init_program) +avg_cost = layers.mean(x=cost, program=program, init_program=init_program) + +sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) +opts = sgd_optimizer.minimize(avg_cost) + +BATCH_SIZE = 128 +PASS_NUM = 1 + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + +place = core.CPUPlace() +exe = Executor(place) + +exe.run(init_program, feed={}, fetch_list=[]) + +for pass_id in range(PASS_NUM): + batch_id = 0 + for data in train_reader(): + img_data = np.array(map(lambda x: x[0].reshape(data_shape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + batch_size = 1 + for i in y_data.shape: + batch_size = batch_size * i + y_data = y_data.reshape([batch_size, 1]) + + tensor_img = core.LoDTensor() + tensor_y = core.LoDTensor() + tensor_img.set(img_data, place) + tensor_y.set(y_data, place) + + outs = exe.run(program, + feed={"pixel": tensor_img, + "label": tensor_y}, + fetch_list=[avg_cost]) + + loss = np.array(outs[0]) + # print("pass_id:" + str(pass_id) + " batch_id:" + str(batch_id) + + # " loss:" + str(loss)) + batch_id = batch_id + 1 + + if batch_id > 1: + # this model is 
slow, so if we can train two mini-batches, we think it works properly. + exit(0) +exit(1) From fab6f30ff62a14332903660a404f6b0d5f08be1c Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 30 Oct 2017 09:51:08 +0800 Subject: [PATCH 60/81] Add empty sequence case in unit test --- python/paddle/v2/framework/tests/test_seq_expand.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/paddle/v2/framework/tests/test_seq_expand.py b/python/paddle/v2/framework/tests/test_seq_expand.py index 901102802b..ff17edd04b 100644 --- a/python/paddle/v2/framework/tests/test_seq_expand.py +++ b/python/paddle/v2/framework/tests/test_seq_expand.py @@ -50,5 +50,14 @@ class TestSeqExpandCase2(TestSeqExpand): self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} +class TestSeqExpandCase3(TestSeqExpand): + def set_data(self): + x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') + x_lod = [[0, 1, 2, 3, 4]] + y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32') + y_lod = [[0, 2, 4, 4, 6]] + self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + + if __name__ == '__main__': unittest.main() From 8d4e2d4cb37b190c16fbc35e2528f6caa536d53f Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 30 Oct 2017 11:46:47 +0800 Subject: [PATCH 61/81] 1. Add unit test for empty sequence case 2. Fix comments and PADDLE_ENFORCE checks --- paddle/operators/seq_expand_op.cc | 32 ++++++++++++++++++++++++------- paddle/operators/seq_expand_op.h | 17 ++++++++++++---- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc index 660e86e9cc..def5efa0e8 100644 --- a/paddle/operators/seq_expand_op.cc +++ b/paddle/operators/seq_expand_op.cc @@ -25,10 +25,8 @@ class SeqExpandOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of SeqExpandOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of SeqExpandOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasOutput("Out")); PADDLE_ENFORCE( ctx->HasInput("Y"), "Input(Y) of SeqExpandOp should not be null while repeat == 0."); @@ -54,7 +52,7 @@ class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker { "The element numbers of last level in input('Y') " "must be equal to dims[0] of input('X')."); AddOutput("Out", - "The output of seq_expand op." + "(LoDTensor) The output of seq_expand op." "The lod of output will be the same as input(Y)'s lod."); AddComment(R"DOC( Expand input(X) according to LOD of input(Y).
@@ -69,6 +67,7 @@ Given 2-level a LoDTensor input(X) and input(Y) Y.lod = [[0, 2, 4], [0, 3, 6, 7, 8]] +with condition len(Y.lod[-1]) -1 == X.dims[0] then we get 2-level LoDTensor Out.lod = [[0, 2, 4], [0, 3, 6, 7, 8]] @@ -83,6 +82,7 @@ Given a 0-level LoDTensor input(X) X.dims = [3, 1] and input(Y) Y.lod = [[0, 2, 3, 6]] +with condition len(Y.lod[-1]) -1 == X.dims[0] then we get 1-level LoDTensor Out.lod = [[0, 2, 3, 6]] Out.data = [a, a, b, c, c, c] @@ -96,11 +96,29 @@ Given a 0-level LoDTensor input(X) X.dims = [3, 2] and input(Y) Y.lod = [[0, 2, 3, 6]] +with condition len(Y.lod[-1]) -1 == X.dims[0] then we get 1-level LoDTensor Out.lod = [[0, 2, 3, 6]] Out.data = [[a,b], [a,b] [c,d], [e, f], [e, f], [e, f]] Out.dims = [6, 2] +Case 4: + +Given 2-level a LoDTensor input(X) + X.lod = [[0, 2, 3], + [0, 1, 3, 4]] + X.data = [a, b, c, d] + X.dims = [4, 1] +and input(Y) + Y.lod = [[0, 2, 4], + [0, 3, 6, 6, 8]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 2-level LoDTensor + Out.lod = [[0, 2, 4], + [0, 3, 6, 6, 8]] + Out.data = [a, a, a, b, b, b, d, d] + Out.dims = [8, 1] + )DOC"); } @@ -112,8 +130,8 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Out")); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); auto x_dims = ctx->GetInputDim("X"); diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index ad3f42116d..aa91e0f929 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/seq_expand_op.h @@ -36,7 +36,6 @@ class SeqExpandKernel : public framework::OpKernel { "The size of last lod level in Input(Y)" "must be equal to dims[0] of Input(X)."); out->set_lod(y->lod()); - out->Resize(y->dims()); auto place = context.GetEigenDevice(); size_t element_len = framework::product(x_dims) / x_dims[0]; T* out_data = out->mutable_data(context.GetPlace()); @@ -57,6 +56,18 @@ class SeqExpandKernel : public framework::OpKernel { } }; +/* + *Given Grad(Out) + * + * Grad(Out).lod = [[0, 2], + * [0, 3, 6]] + * Grad(Out).data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6] + * Then + * Grad(X).data = [(0.1 + 0.2 + 0.3), (0.4 + 0.5 + 0.6)] + * = [0.6, 1.5] + * Grad(X).lod = Input(X).lod + * + * */ template class SeqExpandGradKernel : public framework::OpKernel { public: @@ -68,10 +79,8 @@ class SeqExpandGradKernel : public framework::OpKernel { auto out_last_level = out->lod().back(); d_x->set_lod(x->lod()); const T* d_out_data = d_out->data(); - auto d_out_dims = d_out->dims(); T* d_x_data = d_x->mutable_data(context.GetPlace()); - size_t element_len = framework::product(d_out_dims) / d_out_dims[0]; - + size_t element_len = d_out->numel() / d_out->dims()[0]; for (size_t i = 0; i < out_last_level.size() - 1; ++i) { size_t repeat = out_last_level[i + 1] - out_last_level[i]; Eigen::TensorMap< From 7942984f8548d84042ed614890bbb4da8942cc61 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 30 Oct 2017 11:47:33 +0800 Subject: [PATCH 62/81] follow comments --- paddle/operators/sequence_conv_op.cc | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc index a73ceb4157..f086313411 100644 --- 
a/paddle/operators/sequence_conv_op.cc +++ b/paddle/operators/sequence_conv_op.cc @@ -117,10 +117,11 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { "sequence according to context_length, context_stride and " "context_start") .AsDispensable(); - AddInput("Filter", - "(Tensor) the input(Filter) is an learnable parameter." - "This is a tensor with shape (N, D), where N is the " - "context_length, D is the output feature size."); + AddInput( + "Filter", + "(Tensor) the input(Filter) is an learnable parameter." + "This is a tensor with shape (N, D), where N is the " + "context_length * input_hidden_size, D is the output feature size."); AddOutput( "Out", "(LoDTensor) the output(Out) is a LodTensor, which support " @@ -133,18 +134,21 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { "is trainable or not.") .SetDefault(false); AddAttr("contextLength", - "(int, default 3) the contextLength of SequenceConvOp is the " + "(int) the contextLength of SequenceConvOp is the " "height of the convolution kernel.") - .SetDefault(3) .GreaterThan(0); AddAttr("contextStart", "(int, default 0) the contextStart of SequenceConvOp " "represents the beginning of the convolution of the number of " - "rows of sequence, which can be negative.") + "rows of sequence, which can be negative. The negative number " + "means to pad contextStart time-steps of zeros or learnable " + "parameters at the beginning of each instance. The positive " + "number means to skip contextStart time-steps of each " + "instance.") .SetDefault(0); AddAttr("contextStride", "(int, default 1) the contextStride of SequenceConvOp " - "represents the step length of convolution. " + "represents the stride length of convolution kernel. " "Currently, SequenceConvOp only supports" "contextStride=1.") .SetDefault(1) From 84f471b42e7e8681c95453a01b0f7a1db0fd5125 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 30 Oct 2017 13:44:26 +0800 Subject: [PATCH 63/81] Fix comments --- paddle/operators/seq_expand_op.cc | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc index def5efa0e8..08fda9b445 100644 --- a/paddle/operators/seq_expand_op.cc +++ b/paddle/operators/seq_expand_op.cc @@ -27,9 +27,7 @@ class SeqExpandOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X")); PADDLE_ENFORCE(ctx->HasOutput("Out")); - PADDLE_ENFORCE( - ctx->HasInput("Y"), - "Input(Y) of SeqExpandOp should not be null while repeat == 0."); + PADDLE_ENFORCE(ctx->HasInput("Y")); framework::DDim out_dim; out_dim = ctx->GetInputDim("Y"); ctx->ShareLoD("Y", "Out"); @@ -43,14 +41,14 @@ class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", - "(Tensor or LoDTensor) The input('X') of this operator can be a " + "(Tensor or LoDTensor) The input(X) of this operator can be a " "LoDTensor or a base Tensor."); AddInput("Y", - "(LoDTensor)The reference input('Y') of seq_expand op." + "(LoDTensor)The reference input(Y) of seq_expand op." "It must be a LoDTensor with k-level(k>0)." - "Input(X) will be expanded according to LOD of input(Y)." - "The element numbers of last level in input('Y') " - "must be equal to dims[0] of input('X')."); + "The input(X) will be expanded according to LOD of input(Y)." 
+ "The element numbers of last level in input(Y) " + "must be equal to dims[0] of input(X)."); AddOutput("Out", "(LodTensor)The output of seq_expand op." "The lod of output will be as same as input(Y)'s lod."); @@ -133,7 +131,7 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("X")); PADDLE_ENFORCE(ctx->HasInput("Out")); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); + "The input(Out@GRAD) should not be null"); auto x_dims = ctx->GetInputDim("X"); auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { From b08ae0b1dc5eaa36c39eb1bacc641072cc9f0b9e Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 30 Oct 2017 16:57:12 +0800 Subject: [PATCH 64/81] fix code format and doc --- paddle/operators/math/context_project.h | 115 +++++++++++------------- paddle/operators/sequence_conv_op.cc | 32 +++---- paddle/operators/sequence_conv_op.h | 20 ++--- 3 files changed, 77 insertions(+), 90 deletions(-) diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index 7d9cdab2cf..e028336041 100644 --- a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -16,34 +16,36 @@ limitations under the License. */ #include "paddle/framework/eigen.h" #include "paddle/framework/lod_tensor.h" -#include "paddle/framework/tensor.h" #include "paddle/operators/math/im2col.h" namespace paddle { namespace operators { namespace math { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template using EigenMatrix = framework::EigenMatrix; + /* - * \brief Context projection concatenate features in adjacent time steps in + * \brief Context projection concatenates features in adjacent time-steps in * a sequence. The i-th row of the output is the concatenation of * context_length rows of the input. The context_length rows are the * consecutive rows from the i+shift_start row. * ContextProjectGradFunctor is the inverse process of ContextProjectFunctor. - + * * \param in Input data. - * \param Shape The shape of Input data, - * [minibatch, input_hidden_size]. + * \param Shape The shape of Input data: + * [mini-batch, input_hidden_size]. * * \param padding_data Padding data. - * \param Shape The shape of Padding data, - * [up_pad + down_pad, input_hidden_size]. + * \param Shape The shape of Padding data: + * [up_pad + down_pad, input_hidden_size]. * * \param col Col data. - * \param Shape The shape of Col data, - * [minibatch, context_length * input_hidden_size]. + * \param Shape The shape of Col data: + * [mini-batch, context_length * input_hidden_size]. * * For a mini-batch of 2 variable lengths sentences, containing 3, and 1 * time-steps: @@ -61,40 +63,37 @@ using EigenMatrix = framework::EigenMatrix; * representation is 2. 
* * - Case1: - * If context_start is -1 and padding_trainable is false, we use zero to pad - * instead of learned weight to pad, - * and the context_lenth is 3, the output (Out) is: + * If context_start is -1 and padding_trainable is false, we use zero to pad + * instead of learned weight to pad, + * and the context_length is 3, the output (Out) is: * - * Out =[[0, 0, a1, a2, b1, b2; - * a1, a2, b1, b2, c1, c2; - * b1, b2, c1, c2, 0, 0 ] - * [0, 0, d1, d2, 0, 0 ]] + * Out =[[0, 0, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, 0, 0 ] + * [0, 0, d1, d2, 0, 0 ]] * * - Case2: - * If context_start is -1 and padding_trainable is true, we use learned weight - * to pad, - * and the context_lenth is 3, the output (Out) is: + * If context_start is -1 and padding_trainable is true, we use learned weight + * to pad, + * and the context_length is 3, the output (Out) is: * - * Out = [[w1, w2, a1, a2, b1, b2; - * a1, a2, b1, b2, c1, c2; - * b1, b2, c1, c2, w3, w4] - * [w1, w2, d1, d2, w3, w4]] + * Out = [[w1, w2, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, w3, w4] + * [w1, w2, d1, d2, w3, w4]] * */ template class ContextProjectFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::LoDTensor& in, - const framework::Tensor& padding_data, framework::Tensor& col, + void operator()(const platform::DeviceContext& context, const LoDTensor& in, + const Tensor& padding_data, Tensor& col, bool padding_trainable, int context_start, int context_length, int context_stride, int up_pad, int down_pad) { auto lod_level_0 = in.lod()[0]; - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> - im2col_ocf; + math::Im2ColFunctor im2col_ocf; int input_row_begin, input_row_end; int sequence_height, sequence_width; @@ -106,19 +105,18 @@ class ContextProjectFunctor { : static_cast(lod_level_0[i]); input_row_end = static_cast(lod_level_0[i + 1]); - framework::Tensor out_t = col.Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + Tensor out_t = col.Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); if (input_row_begin < input_row_end) { - framework::Tensor in_t = in.Slice(input_row_begin, input_row_end); + Tensor in_t = in.Slice(input_row_begin, input_row_end); std::vector output_shape( {sequence_height, 1, 1, context_length, sequence_width}); // output_height, output_width, // input_channels, filter_height, filter_width - out_t.Resize(framework::make_ddim(output_shape)); std::vector input_shape( @@ -134,9 +132,8 @@ class ContextProjectFunctor { } if (padding_trainable) { for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - framework::Tensor out_t = - col.Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + Tensor out_t = col.Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); @@ -150,10 +147,9 @@ class ContextProjectFunctor { for (int k = 0; k < padding_rows; ++k) { int padding_size = k + context_length < up_pad ? context_length : up_pad - k; - framework::Tensor out_t_sub = out_t.Slice( - k * context_length, k * context_length + padding_size); - framework::Tensor w_sub = padding_data.Slice(k, k + padding_size); - // in this block, using EigenVector::Flatten is ok too. 
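// Forward pass with trainable padding: for each top padding row k, the
// rows [k, k + padding_size) of padding_data are copied into the first
// padding_size context slots of output row k; padding_size shrinks to
// up_pad - k once k + context_length reaches up_pad.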
+ Tensor out_t_sub = out_t.Slice(k * context_length, + k * context_length + padding_size); + Tensor w_sub = padding_data.Slice(k, k + padding_size); auto out_t_sub_e = EigenMatrix::From(out_t_sub); auto w_sub_e = EigenMatrix::From(w_sub); out_t_sub_e.device(*context.GetEigenDevice()) = w_sub_e; @@ -180,10 +176,11 @@ class ContextProjectFunctor { } if (padding_begin > 0 || sequence_height == context_start) padding_idx = padding_begin + t; - framework::Tensor out_t_sub = out_t.Slice( + + Tensor out_t_sub = out_t.Slice( (down_pad_begin_row + t) * context_length - padding_size, (down_pad_begin_row + t) * context_length); - framework::Tensor w_sub = padding_data.Slice( + Tensor w_sub = padding_data.Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); auto out_t_sub_e = EigenMatrix::From(out_t_sub); auto w_sub_e = EigenMatrix::From(w_sub); @@ -199,16 +196,13 @@ class ContextProjectFunctor { template class ContextProjectGradFunctor { public: - void operator()(const platform::DeviceContext& context, - framework::LoDTensor& in, framework::Tensor& padding_data, - framework::Tensor& col, bool padding_trainable, + void operator()(const platform::DeviceContext& context, LoDTensor& in, + Tensor& padding_data, Tensor& col, bool padding_trainable, int context_start, int context_length, int context_stride, int up_pad, int down_pad, bool input_grad, bool pad_grad) { auto lod_level_0 = in.lod()[0]; - paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> - col2im_ocf; + math::Col2ImFunctor col2im_ocf; int input_row_begin, input_row_end; int sequence_height, sequence_width; @@ -221,20 +215,18 @@ class ContextProjectGradFunctor { : static_cast(lod_level_0[i]); input_row_end = static_cast(lod_level_0[i + 1]); - framework::Tensor out_t = - col.Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + Tensor out_t = col.Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); if (input_row_begin < input_row_end) { - framework::Tensor in_t = in.Slice(input_row_begin, input_row_end); + Tensor in_t = in.Slice(input_row_begin, input_row_end); std::vector output_shape( {sequence_height, 1, 1, context_length, sequence_width}); // output_height, output_width, // input_channels, filter_height, filter_width - out_t.Resize(framework::make_ddim(output_shape)); std::vector input_shape( @@ -252,9 +244,8 @@ class ContextProjectGradFunctor { if (pad_grad) { if (padding_trainable) { for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - framework::Tensor out_t = - col.Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + Tensor out_t = col.Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); out_t.Resize({sequence_height * context_length, sequence_width}); @@ -266,10 +257,9 @@ class ContextProjectGradFunctor { for (int k = 0; k < padding_rows; ++k) { int padding_size = k + context_length < up_pad ? context_length : up_pad - k; - framework::Tensor out_t_sub = out_t.Slice( - k * context_length, k * context_length + padding_size); - framework::Tensor w_sub = padding_data.Slice(k, k + padding_size); - // in this block, using EigenVector::Flatten is ok too. 
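// Backward pass: the same padding slices are visited as in the forward
// functor, but the data flows the other way: w_sub accumulates the
// gradient of the padded output positions back into padding_data.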
+ Tensor out_t_sub = out_t.Slice(k * context_length, + k * context_length + padding_size); + Tensor w_sub = padding_data.Slice(k, k + padding_size); auto out_t_sub_e = EigenMatrix::From(out_t_sub); auto w_sub_e = EigenMatrix::From(w_sub); w_sub_e.device(*context.GetEigenDevice()) = @@ -298,10 +288,11 @@ class ContextProjectGradFunctor { } if (padding_begin > 0 || sequence_height == context_start) padding_idx = padding_begin + t; - framework::Tensor out_t_sub = out_t.Slice( + + Tensor out_t_sub = out_t.Slice( (down_pad_begin_row + t) * context_length - padding_size, (down_pad_begin_row + t) * context_length); - framework::Tensor w_sub = padding_data.Slice( + Tensor w_sub = padding_data.Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); auto out_t_sub_e = EigenMatrix::From(out_t_sub); auto w_sub_e = EigenMatrix::From(w_sub); diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc index f086313411..bdb52265a5 100644 --- a/paddle/operators/sequence_conv_op.cc +++ b/paddle/operators/sequence_conv_op.cc @@ -31,18 +31,19 @@ class SequenceConvOp : public framework::OperatorWithKernel { "Output(Out) of SequenceConvOp should not be null."); int context_length = ctx->Attrs().Get("contextLength"); - bool padding_trainable = ctx->Attrs().Get("paddingTrainable"); int context_start = ctx->Attrs().Get("contextStart"); auto in_dims = ctx->GetInputDim("X"); auto filter_dims = ctx->GetInputDim("Filter"); + PADDLE_ENFORCE(ctx->Attrs().Get("contextStride") == 1, + "Currently, SequenceConvOp only supports contextStride=1."); PADDLE_ENFORCE(in_dims.size() == 2 && filter_dims.size() == 2, "Input(X, Filter) should be 2-D tensor."); PADDLE_ENFORCE(filter_dims[0] == context_length * in_dims[1], "Filter's height should be context_length * " - "number_of_input_features ."); + "input_hidden_size ."); - if (padding_trainable) { + if (ctx->Attrs().Get("paddingTrainable")) { PADDLE_ENFORCE( ctx->HasInput("PaddingData"), "Input(PaddingData) of SequenceConvOp should not be null."); @@ -88,6 +89,7 @@ class SequenceConvGradOp : public framework::OperatorWithKernel { } if (ctx->HasOutput(framework::GradVarName("X"))) { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD(framework::GradVarName("X"), "X"); } if (ctx->HasOutput(framework::GradVarName("Filter"))) { ctx->SetOutputDim(framework::GradVarName("Filter"), @@ -105,13 +107,13 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { "X", "(LoDTensor) the input(X) is a LodTensor, which support " "variable-time length input sequence. The underlying tensor in " - "this LoDTensor is a matrix with shape (T, D), where, T is the " - "total time steps in this mini-batch, D is the input feature size."); + "this LoDTensor is a matrix with shape (T, N), where, T is the " + "total time steps in this mini-batch, N is the input_hidden_size."); AddInput("PaddingData", "(Tensor, optional) the input(PaddingData) is an optional " "parameter, and it is learnable. " - "This is a tensor with shape (N, D), where N is the " - "top_pad + bottom_pad, D is the input feature size. In order to " + "This is a tensor with shape (P, N), where P is the " + "top_pad + bottom_pad, N is the input_hidden_size. 
In order to " "ensure the equal length of sequence before and after " "convolution, it is necessary to fill the top and bottom of each " "sequence according to context_length, context_stride and " @@ -120,17 +122,17 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { AddInput( "Filter", "(Tensor) the input(Filter) is an learnable parameter." - "This is a tensor with shape (N, D), where N is the " - "context_length * input_hidden_size, D is the output feature size."); + "This is a tensor with shape (K, M), where K is the " + "context_length * input_hidden_size, M is the output feature size."); AddOutput( "Out", "(LoDTensor) the output(Out) is a LodTensor, which support " "variable-time length output sequence. The underlying tensor in " - "this LoDTensor is a matrix with shape (T, D), where, T is the " - "total time steps in this mini-batch, D is the output feature size."); + "this LoDTensor is a matrix with shape (T, M), where, T is the " + "total time steps in this mini-batch, M is the output feature size."); AddAttr("paddingTrainable", - "(bool, default false) the padding data of SequenceConvOp " + "(bool, default:false) the padding data of SequenceConvOp " "is trainable or not.") .SetDefault(false); AddAttr("contextLength", @@ -138,7 +140,7 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { "height of the convolution kernel.") .GreaterThan(0); AddAttr("contextStart", - "(int, default 0) the contextStart of SequenceConvOp " + "(int, default:0) the contextStart of SequenceConvOp " "represents the beginning of the convolution of the number of " "rows of sequence, which can be negative. The negative number " "means to pad contextStart time-steps of zeros or learnable " @@ -147,7 +149,7 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { "instance.") .SetDefault(0); AddAttr("contextStride", - "(int, default 1) the contextStride of SequenceConvOp " + "(int, default:1) the contextStride of SequenceConvOp " "represents the stride length of convolution kernel. " "Currently, SequenceConvOp only supports" "contextStride=1.") @@ -156,7 +158,7 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( SequenceConvOp performs convolution operation on features of - context_length time-steps of each instance. + contextLength time-steps of each instance. The convolution operation calculates the output based on the input, filter and strides, paddings parameters. The size of each dimension of the parameters is checked in the infer-shape. In order to ensure the equal diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h index 5727238c0d..a57e1752bb 100644 --- a/paddle/operators/sequence_conv_op.h +++ b/paddle/operators/sequence_conv_op.h @@ -40,7 +40,6 @@ class SequenceConvKernel : public framework::OpKernel { int context_stride = context.Attr("contextStride"); bool padding_trainable = context.Attr("paddingTrainable"); - // InferShape by in_lod PADDLE_ENFORCE_EQ(in->lod().size(), 1UL, "Only support one level sequence now."); @@ -51,20 +50,17 @@ class SequenceConvKernel : public framework::OpKernel { int up_pad = std::max(0, -context_start); int down_pad = std::max(0, context_start + context_length - 1); - int sequence_width; - sequence_width = static_cast(in->dims()[1]); + int sequence_width = static_cast(in->dims()[1]); - // Use col_shape in the im2col calculation. 
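// The col buffer gathers context_length rows of the input for every
// time-step, hence the shape {T, context_length * input_hidden_size};
// a GEMM of col with Filter (shape {context_length * input_hidden_size,
// output_feature_size}) then produces Out of shape {T, output_feature_size}.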
framework::DDim col_shape = {in->dims()[0], - sequence_width * context_length}; + context_length * sequence_width}; Tensor col; col.mutable_data(col_shape, context.GetPlace()); - math::SetConstant set_zero; // Because if padding_trainable is false, padding data should be zeros. + math::SetConstant set_zero; set_zero(context.device_context(), &col, static_cast(0)); - paddle::operators::math::ContextProjectFunctor - seq_project_functor; + math::ContextProjectFunctor seq_project_functor; seq_project_functor(context.device_context(), *in, *padding_data, col, padding_trainable, context_start, context_length, @@ -79,8 +75,8 @@ template class SequenceConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* out_g = context.Input(framework::GradVarName("Out")); auto* in_g = context.Output(framework::GradVarName("X")); + auto* out_g = context.Input(framework::GradVarName("Out")); auto* filter_g = context.Output(framework::GradVarName("Filter")); auto* padding_data_g = context.Output(framework::GradVarName("PaddingData")); @@ -113,10 +109,8 @@ class SequenceConvGradKernel : public framework::OpKernel { math::matmul(context.device_context(), *out_g, false, *filter, true, T(1.0), &col, T(1.0)); } - paddle::operators::math::ContextProjectFunctor - seq_project_functor; - paddle::operators::math::ContextProjectGradFunctor - seq_project_grad_functor; + math::ContextProjectFunctor seq_project_functor; + math::ContextProjectGradFunctor seq_project_grad_functor; if (in_g) { in_g->mutable_data(context.GetPlace()); From 73d785572697f0cc0ebb03791048001dd52174d1 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 30 Oct 2017 10:11:30 -0700 Subject: [PATCH 65/81] Fix a type error top_k_op (#5201) * Fix Type error * Fix error --- paddle/operators/top_k_op.h | 4 ++-- python/paddle/v2/framework/tests/test_top_k_op.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h index 4b248faa12..bc8563717a 100644 --- a/paddle/operators/top_k_op.h +++ b/paddle/operators/top_k_op.h @@ -40,7 +40,7 @@ class TopkKernel : public framework::OpKernel { const size_t k = static_cast(ctx.Attr("k")); T* output_data = output->mutable_data(ctx.GetPlace()); - T* indices_data = indices->mutable_data(ctx.GetPlace()); + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); auto eg_input = EigenMatrix::From(*input); @@ -66,7 +66,7 @@ class TopkKernel : public framework::OpKernel { }); for (size_t j = 0; j < k; j++) { output_data[i * k + j] = vec[j].first; - indices_data[i * k + j] = vec[j].second; + indices_data[i * k + j] = int64_t(vec[j].second); } } } diff --git a/python/paddle/v2/framework/tests/test_top_k_op.py b/python/paddle/v2/framework/tests/test_top_k_op.py index 694f37d612..6e8fbefa6e 100644 --- a/python/paddle/v2/framework/tests/test_top_k_op.py +++ b/python/paddle/v2/framework/tests/test_top_k_op.py @@ -9,7 +9,7 @@ class TestTopkOp(OpTest): k = 1 input = np.random.random((32, 84)).astype("float32") output = np.ndarray((32, k)) - indices = np.ndarray((32, k)) + indices = np.ndarray((32, k)).astype("int64") self.inputs = {'X': input} self.attrs = {'k': k} @@ -32,7 +32,7 @@ class TestTopkOp3d(OpTest): input = np.random.random((32, 2, 84)).astype("float32") input_flat_2d = input.reshape(64, 84) output = np.ndarray((64, k)) - indices = np.ndarray((64, k)).astype("int") + indices = np.ndarray((64, k)).astype("int64") # FIXME: should use 'X': input for a 3d input self.inputs = 
{'X': input_flat_2d} From 6c8dce9ce23103c50e639c2dd89e41b3fbd37aea Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 30 Oct 2017 10:11:51 -0700 Subject: [PATCH 66/81] Contribute and logging (#5181) * Create vlog_guide.md * Move design/vlog_guide.md into CONTRIBUTE.md * In response to comments from Yu Yang and Tony * In response to comments from Luo Tao --- CONTRIBUTING.md | 163 ++++++++++++++++- doc/howto/dev/contribute_to_paddle_en.md | 219 ----------------------- 2 files changed, 162 insertions(+), 220 deletions(-) delete mode 100644 doc/howto/dev/contribute_to_paddle_en.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0d4bb973ae..f50be9de21 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1 +1,162 @@ -./doc/howto/dev/contribute_to_paddle_en.md +# Contribute Code + +We sincerely appreciate your contribution. This document explains our workflow and work style. + +## Workflow + +PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/). The following steps guide usual contributions. + +1. Fork + + Our development community has been growing fastly; it doesn't make sense for everyone to write into the official repo. So, please file Pull Requests from your fork. To make a fork, just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/). + +1. Clone + + To make a copy of your fork to your local computers, please run + + ```bash + git clone https://github.com/your-github-account/paddle + cd paddle + ``` + +1. Create the local feature branch + + For daily works like adding a new feature or fixing a bug, please open your feature branch before coding: + + ```bash + git checkout -b my-cool-stuff + ``` + +1. Commit + + Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands: + + ```bash + pip install pre-commit + pre-commit install + ``` + + Our pre-commit configuration requires clang-format 3.8 for auto-formating C/C++ code and yapf for Python. + + Once installed, `pre-commit` checks the style of code and documentation in every commit. We will see something like the following when you run `git commit`: + + ``` + ➜ git commit + CRLF end-lines remover...............................(no files to check)Skipped + yapf.................................................(no files to check)Skipped + Check for added large files..............................................Passed + Check for merge conflicts................................................Passed + Check for broken symlinks................................................Passed + Detect Private Key...................................(no files to check)Skipped + Fix End of Files.....................................(no files to check)Skipped + clang-formater.......................................(no files to check)Skipped + [my-cool-stuff c703c041] add test file + 1 file changed, 0 insertions(+), 0 deletions(-) + create mode 100644 233 + ``` + +1. Build and test + + Users can build PaddlePaddle natively on Linux and Mac OS X. But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md). + +1. Keep pulling + + An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others work early, and it's easier to resolve smaller conflicts. 
+ + ```bash + git remote add upstream https://github.com/PaddlePaddle/Paddle + git pull upstream develop + ``` + +1. Push and file a pull request + + You can "push" your local work into your forked repo: + + ```bash + git push origin my-cool-stuff + ``` + + The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one. + + To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/). + + If your change is for fixing an issue, please write ["Fixes "](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request. Github would close the issue when the owners merge your pull request. + + Please remember to specify some reviewers for your pull request. If you don't know who are the right ones, please follow Github's recommendation. + + +1. Delete local and remote branches + + To keep your local workspace and your fork clean, you might want to remove merged branches: + + ```bash + git push origin :my-cool-stuff + git checkout develop + git pull upstream develop + git branch -d my-cool-stuff + ``` + +### Code Review + +- Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email. Please do this after your pull request passes the CI. + +- Please answer reviewers' every comment. If you are to follow the comment, please write "Done"; please give a reason otherwise. + +- If you don't want your reviewers to get overwhelmed by email notifications, you might reply their comments by [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/). + +- Reduce the unnecessary commits. Some developers commit often. It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`. + + +## Coding Standard + +### Code Style + +Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html). + +Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/). + +Our build process helps to check the code style. In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default. This flag is on + +Please install pre-commit, which automatically reformat the changes to C/C++ and Python code whenever we run `git commit`. To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43). + +### Unit Tests + +Please remember to add related unit tests. + +- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md). + +- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/). + + +### Writing Logs + +We use [glog](https://github.com/google/glog) for logging in our C/C++ code. 
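A minimal sketch of the glog API (the function and message below are placeholders, not code from this repository):

```c++
#include <glog/logging.h>

void ReportParameters(int num_params) {
  // LOG streams like std::ostream; the severities are INFO, WARNING,
  // ERROR and FATAL, and a FATAL message aborts the program.
  LOG(INFO) << "Loaded " << num_params << " parameters.";
}
```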
+ +For general information, please use `LOG`. For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose). The reason is at [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ). + +`VLOG` requires a *verbose level* parameter. For example: + +```c++ +VLOG(3) << "Operator FC is taking " << num_inputs << "inputs." +``` + +When we run a PaddlePaddle application or test, we can specify a verbose threshold. For example: + +```bash +GLOG_vmodule=buddy_allocator=2 \ +GLOG_v=10 \ +python \ +../python/paddle/v2/framework/tests/test_recurrent_op.py +``` + +This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3. This suggests that we output overall messages in lower verbose levels, so they display with higher probability. When coding C++, please follow the verbose level convention as follows: + +- verbose level 1: + - [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) +- verbose level 3: + - [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) +- verbose level 5: + - [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory) + - [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) +- verbose level 7: + - [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math) diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md deleted file mode 100644 index 40d1eb62d7..0000000000 --- a/doc/howto/dev/contribute_to_paddle_en.md +++ /dev/null @@ -1,219 +0,0 @@ -# Contribute Code - -We sincerely appreciate your contributions. You can use fork and pull request -workflow to merge your code. - -## Code Requirements -- Your code comments must be fully documented by - [Doxygen](http://www.stack.nl/~dimitri/doxygen/) style. -- Make sure the compiler option `WITH_STYLE_CHECK` is on and the compiler - passes the code style check. -- All code must have unit test. -- Pass all unit tests. - -The following tutorial guides you into submitting your contibution. - -## [Creating a Fork](https://help.github.com/articles/fork-a-repo/) - -Just head over to the GitHub page and click the "Fork" button. -It's just that simple. - -## Clone - -Clone remote repository. - -```bash -➜ git clone https://github.com/USERNAME/Paddle -➜ cd Paddle -``` - -## Create a local branch - -Paddle is currently using [Git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/). - -All feature and bug fix development work should be done on a new branch, generally create new branch from `develop` branch . - -```bash -➜ git checkout -b my-cool-stuff -``` - -Before the checkout, you need to keep the current branch directory clean, otherwise the untracked file will be brought to the new branch, which can be inspected by `git status`. - -## Using `pre-commit` hook - -Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git -pre-commit hooks. It can help us format source codes (cpp, python), check some -basic thing before commit (only one EOL for each file, do not add a huge file -in git). `pre-commit` tests is a part of unit tests in Travis-CI now, every -PR doesn't fit hook can not be merged into Paddle. 
- -To use [pre-commit](http://pre-commit.com/), you should install it by -`pip install pre-commit`, and currently, Paddle uses `clang-format` to format -c/cpp sources. Please make sure clang-format 3.8+ installed. - -Install and run it as follow: - -```bash -➜ pip install pre-commit -➜ pre-commit install -``` - -When you commit your code, the pre-commit hook will check the local code if there is -anything not suitable to commit, and so on. - -## Start to develop - -In this tutorial, I delete a line in README.md and created a new file. - -We can use `git status` to inspect the changes of current directory, `git diff` to see difference. - -```bash -➜ git status -On branch test -Changes not staged for commit: - (use "git add ..." to update what will be committed) - (use "git checkout -- ..." to discard changes in working directory) - - modified: README.md - -Untracked files: - (use "git add ..." to include in what will be committed) - - test - -no changes added to commit (use "git add" and/or "git commit -a") -``` -## Build and Test - -We package PaddlePaddle's compile environment into a Docker image, called the develop image named `paddle:dev`, it contains all compiling tools that PaddlePaddle needs. - -If you want to build the develop image, just run: - -```bash -➜ docker build -t paddle:dev . -``` - -Then we can use the develop image to build PaddlePaddle source. For example: - -```bash -➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev -``` - -The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated. - -Then we can generate the production image by copying the compiled PaddlePaddle program into the image by - -```bash -➜ docker build -t paddle:prod -f build/Dockerfile . -``` - -Run unit test finally: - -```bash -➜ docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" -``` - -For more details, you can read [this doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst). - -## Commit - -Next we cancel the changes to the README.md file and then commit our changes by following command lines: - -```bash -➜ git checkout -- README.md -➜ git status -On branch test -Untracked files: - (use "git add ..." to include in what will be committed) - - test - -nothing added to commit but untracked files present (use "git add" to track) -➜ git add test -``` - -We should write a description of each commit by `git commit` to allow others to know -the changes in these files. 
- -```bash -➜ git commit -CRLF end-lines remover...............................(no files to check)Skipped -yapf.................................................(no files to check)Skipped -Check for added large files..............................................Passed -Check for merge conflicts................................................Passed -Check for broken symlinks................................................Passed -Detect Private Key...................................(no files to check)Skipped -Fix End of Files.....................................(no files to check)Skipped -clang-formater.......................................(no files to check)Skipped -[my-cool-stuff c703c041] add test file - 1 file changed, 0 insertions(+), 0 deletions(-) - create mode 100644 233 -``` - -## Keeping Fork Up to Date - -Before pull your request, you should sync your code from the latest PaddlePaddle. -To do this, you'll need to add a remote at first: - -```bash -➜ git remote add upstream https://github.com/PaddlePaddle/Paddle -➜ git remote -origin -upstream -``` - -Update your fork with the latest upstream changes: - -```bash -➜ git fetch upstream -➜ git pull upstream develop -``` - -Now, your local master branch is up-to-date with everything modified upstream. - -## Push to GitHub - -```bash -# push to your repository in Github -➜ git push origin my-cool-stuff -``` - -## Create an issue and a Pull Request - -Create an Issue to describe the problem and record its number. - -Go to the page for your fork on GitHub, select your development branch, -and click the `New pull request`. - -screen shot 2017-04-26 at 9 09 28 pm - -Then select the target branch: - -screen shot 2017-04-26 at 9 11 52 pm - -We can add `resolve #Issue number` in PR description to close the issue automatically after the PR is merge. More details in . - -Then wait for review, if there need to modify, refer to the above steps to update the corresponding origin branch. - -## Delete origin branch - -After the PR is merge into the main repository, we can delete the remote branch on the PR page. - -screen shot 2017-04-26 at 9 18 24 pm - -Or just run: - -```bash -➜ git push origin :my-cool-stuff -``` - -## Delete local branch - -Finally, we delete local branch: - -```bash -➜ git checkout develop - -# delete my-cool-stuff branch -➜ git branch -D my-cool-stuff -``` From a186b53dfbc46963904f790077244a10ea1cb60d Mon Sep 17 00:00:00 2001 From: QI JUN Date: Mon, 30 Oct 2017 10:37:44 -0700 Subject: [PATCH 67/81] add init_gflags interface (#5193) * add init_gflags interface * refine code * follow comments --- paddle/pybind/pybind.cc | 21 +++++++++++++++++++++ python/paddle/v2/framework/__init__.py | 10 ++++++++++ 2 files changed, 31 insertions(+) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index bf6e122642..4baff895da 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/pybind/protobuf.h" +#include // for call_once +#include "gflags/gflags.h" #include "paddle/framework/backward.h" #include "paddle/framework/executor.h" #include "paddle/framework/feed_fetch_method.h" @@ -45,6 +47,24 @@ static size_t UniqueIntegerGenerator() { return generator.fetch_add(1); } +std::once_flag gflags_init_flag; + +// TODO(qijun) move init gflags to init.cc +void InitGflags(std::vector &argv) { + std::call_once(gflags_init_flag, [&]() { + int argc = argv.size(); + char **arr = new char *[argv.size()]; + std::string line; + for (size_t i = 0; i < argv.size(); i++) { + arr[i] = &argv[i][0]; + line += argv[i]; + line += ' '; + } + google::ParseCommandLineFlags(&argc, &arr, true); + VLOG(1) << "Init commandline: " << line; + }); +} + bool IsCompileGPU() { #ifndef PADDLE_WITH_CUDA return false; @@ -483,6 +503,7 @@ All parameter, weight, gradient are variables in Paddle. }); m.def("unique_integer", UniqueIntegerGenerator); + m.def("init_gflags", InitGflags); m.def("is_compile_gpu", IsCompileGPU); m.def("set_feed_variable", framework::SetFeedVariable); diff --git a/python/paddle/v2/framework/__init__.py b/python/paddle/v2/framework/__init__.py index c942373c66..5df612bf35 100644 --- a/python/paddle/v2/framework/__init__.py +++ b/python/paddle/v2/framework/__init__.py @@ -1 +1,11 @@ +import sys +import core __all__ = ['proto'] +argv = [] +if core.is_compile_gpu(): + argv = list(sys.argv) + [ + "--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory" + ] +else: + argv = list(sys.argv) + ["--tryfromenv=use_pinned_memory"] +core.init_gflags(argv) From 8f4c488e6e2fa88438142fce1ef504521c2fd18e Mon Sep 17 00:00:00 2001 From: Thuan Nguyen Date: Mon, 30 Oct 2017 11:45:50 -0700 Subject: [PATCH 68/81] * Add symbolic link from Paddle/CONTRIBUTING.md to doc/howto/dev/contribute_to_paddle_en.md so sphinx can generate the document * Update CONTRIBUTING.md links so sphinx does not add these links to the TOC * Removed dev/contribute_to_paddle_cn.md from documentation, since this document is not in sync with Paddle/CONTRIBUTING.md --- CONTRIBUTING.md | 13 ++++--------- doc/howto/dev/contribute_to_paddle_en.md | 1 + doc/howto/index_cn.rst | 1 - 3 files changed, 5 insertions(+), 10 deletions(-) create mode 120000 doc/howto/dev/contribute_to_paddle_en.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f50be9de21..a60453ff4e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -151,12 +151,7 @@ python \ This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3. This suggests that we output overall messages in lower verbose levels, so they display with higher probability. 
When coding C++, please follow the verbose level convention as follows: -- verbose level 1: - - [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) -- verbose level 3: - - [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) -- verbose level 5: - - [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory) - - [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) -- verbose level 7: - - [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math) +- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) +- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) +- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) +- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math) diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md new file mode 120000 index 0000000000..c97564d93a --- /dev/null +++ b/doc/howto/dev/contribute_to_paddle_en.md @@ -0,0 +1 @@ +../../../CONTRIBUTING.md \ No newline at end of file diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst index 0608aa3096..76d3e0a009 100644 --- a/doc/howto/index_cn.rst +++ b/doc/howto/index_cn.rst @@ -21,7 +21,6 @@ dev/build_cn.rst dev/write_docs_cn.rst - dev/contribute_to_paddle_cn.md 模型配置 -------- From cdc700bb3283cf3e8ce8ff83f2292d0a98e96a99 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 31 Oct 2017 03:23:29 +0800 Subject: [PATCH 69/81] add resnet (#5206) * add resnet * optimize code --- python/paddle/v2/framework/layers.py | 5 +- .../tests/test_image_classification_layer.py | 23 ++++ .../tests/test_image_classification_train.py | 130 +++++++++++++++++- 3 files changed, 152 insertions(+), 6 deletions(-) diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py index 041a3b2c0b..0212afec9d 100644 --- a/python/paddle/v2/framework/layers.py +++ b/python/paddle/v2/framework/layers.py @@ -5,7 +5,7 @@ import re __all__ = [ 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', - 'StaticRNN', 'cast' + 'StaticRNN', 'cast', 'batch_norm' ] @@ -150,7 +150,7 @@ def _create_op_func_(op_type): outputs[name] = [helper.create_tmp_variable(dtype=dtype)] helper.append_op( type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs) - return out + return helper.append_activation(out) func.__name__ = op_type globals()[op_type] = func @@ -160,6 +160,7 @@ def _create_op_func_(op_type): _create_op_func_('mean') _create_op_func_('mul') +_create_op_func_('elementwise_add') _create_op_func_('dropout') _create_op_func_('reshape') diff --git a/python/paddle/v2/framework/tests/test_image_classification_layer.py b/python/paddle/v2/framework/tests/test_image_classification_layer.py index 908cf44b88..7411689b61 100644 --- a/python/paddle/v2/framework/tests/test_image_classification_layer.py +++ b/python/paddle/v2/framework/tests/test_image_classification_layer.py @@ -70,6 +70,29 @@ class TestLayer(unittest.TestCase): # print str(program) + def test_elementwise_add_with_act(self): + program = Program() + init_program = Program() + image1 = layers.data( + name='pixel1', + shape=[3, 48, 48], + data_type='float32', + program=program, + init_program=init_program) + image2 = layers.data( + name='pixel2', + shape=[3, 48, 
48], + data_type='float32', + program=program, + init_program=init_program) + out = layers.elementwise_add( + x=image1, + y=image2, + act='relu', + program=program, + init_program=init_program) + # print(program) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_image_classification_train.py b/python/paddle/v2/framework/tests/test_image_classification_train.py index 4eb9051261..6b6dec4976 100644 --- a/python/paddle/v2/framework/tests/test_image_classification_train.py +++ b/python/paddle/v2/framework/tests/test_image_classification_train.py @@ -10,6 +10,120 @@ from paddle.v2.framework.executor import Executor import numpy as np +def resnet_cifar10(input, depth=32, program=None, init_program=None): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + program=None, + init_program=None): + tmp = layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=False, + program=program, + init_program=init_program) + return layers.batch_norm( + input=tmp, act=act, program=program, init_program=init_program) + + def shortcut(input, ch_in, ch_out, stride, program, init_program): + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None, program, + init_program) + else: + return input + + def basicblock(input, + ch_in, + ch_out, + stride, + program=program, + init_program=init_program): + tmp = conv_bn_layer( + input, + ch_out, + 3, + stride, + 1, + program=program, + init_program=init_program) + tmp = conv_bn_layer( + tmp, + ch_out, + 3, + 1, + 1, + act=None, + program=program, + init_program=init_program) + short = shortcut(input, ch_in, ch_out, stride, program, init_program) + return layers.elementwise_add( + x=tmp, + y=short, + act='relu', + program=program, + init_program=init_program) + + def layer_warp(block_func, input, ch_in, ch_out, count, stride, program, + init_program): + tmp = block_func(input, ch_in, ch_out, stride, program, init_program) + for i in range(1, count): + tmp = block_func(tmp, ch_out, ch_out, 1, program, init_program) + return tmp + + assert (depth - 2) % 6 == 0 + n = (depth - 2) / 6 + conv1 = conv_bn_layer( + input=input, + ch_out=16, + filter_size=3, + stride=1, + padding=1, + program=program, + init_program=init_program) + res1 = layer_warp( + basicblock, + conv1, + 16, + 16, + n, + 1, + program=program, + init_program=init_program) + res2 = layer_warp( + basicblock, + res1, + 16, + 32, + n, + 2, + program=program, + init_program=init_program) + res3 = layer_warp( + basicblock, + res2, + 32, + 64, + n, + 2, + program=program, + init_program=init_program) + pool = layers.pool2d( + input=res3, + pool_size=8, + pool_type='avg', + pool_stride=1, + program=program, + init_program=init_program) + return pool + + def vgg16_bn_drop(input, program, init_program): def conv_block(input, num_filter, @@ -75,8 +189,16 @@ label = layers.data( data_type='int64', program=program, init_program=init_program) -vgg_net = vgg16_bn_drop(images, program, init_program) -predict = layers.fc(input=vgg_net, + +# Add neural network config +# option 1. resnet +net = resnet_cifar10(images, 32, program, init_program) +# option 2. 
vgg +# net = vgg16_bn_drop(images, program, init_program) + +# print(program) + +predict = layers.fc(input=net, size=classdim, act='softmax', program=program, @@ -123,8 +245,8 @@ for pass_id in range(PASS_NUM): fetch_list=[avg_cost]) loss = np.array(outs[0]) - # print("pass_id:" + str(pass_id) + " batch_id:" + str(batch_id) + - # " loss:" + str(loss)) + print("pass_id:" + str(pass_id) + " batch_id:" + str(batch_id) + + " loss:" + str(loss)) batch_id = batch_id + 1 if batch_id > 1: From 2b1f21a59b8dbb3597061adb30ca531fd82cf76b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 30 Oct 2017 13:54:16 -0700 Subject: [PATCH 70/81] Fix MacOS Compile (#5217) --- paddle/operators/seq_expand_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index aa91e0f929..8703105385 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/seq_expand_op.h @@ -48,7 +48,7 @@ class SeqExpandKernel : public framework::OpKernel { x_t(x_data, 1, element_len); Eigen::TensorMap> out_t(out_data, scale, element_len); - Eigen::array cast({scale, 1}); + Eigen::array cast({{scale, 1}}); out_t.device(place) = x_t.broadcast(cast); x_data += element_len; out_data += element_len * scale; From d3cc7ac3047211d2a8dad72e471f62a87e0171cc Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 30 Oct 2017 14:31:10 -0700 Subject: [PATCH 71/81] Fix top k op GPU code (#5221) * Fix Type error * Fix error * Fix top_k_op GPU code data type --- paddle/operators/top_k_op.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu index 7be6932f1e..7851c71bbe 100644 --- a/paddle/operators/top_k_op.cu +++ b/paddle/operators/top_k_op.cu @@ -23,9 +23,9 @@ using Tensor = framework::Tensor; template struct Pair { __device__ __forceinline__ Pair() {} - __device__ __forceinline__ Pair(T value, int id) : v(value), id(id) {} + __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {} - __device__ __forceinline__ void set(T value, int id) { + __device__ __forceinline__ void set(T value, int64_t id) { v = value; id = id; } @@ -48,7 +48,7 @@ struct Pair { } T v; - int id; + int64_t id; }; template @@ -197,7 +197,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, template __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, Pair topk[], T** topVal, - int** topIds, int& beam, int& k, + int64_t** topIds, int& beam, int& k, const int tid, const int warp) { while (true) { __syncthreads(); @@ -249,7 +249,7 @@ __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, * 4. go to the first setp, until get the topk value. */ template -__global__ void KeMatrixTopK(T* output, int output_stride, int* indices, +__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, const T* src, int lds, int dim, int k) { __shared__ Pair sh_topk[BlockSize]; __shared__ int maxid[BlockSize / 2]; @@ -293,7 +293,7 @@ class TopkOpCUDAKernel : public framework::OpKernel { T* output_data = output->mutable_data(ctx.GetPlace()); // FIXME(typhoonzero): data is always converted to type T? 
- int* indices_data = indices->mutable_data(ctx.GetPlace()); + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); size_t input_height = input->dims()[0]; size_t input_width = input->dims()[1]; From f4710cf0e210f65357b0c9ebc871602addac4131 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 30 Oct 2017 14:45:57 -0700 Subject: [PATCH 72/81] "add sequence conv layer" (#5117) * "add sequence conv layer" * "add sequence layer" * add networks * "fix based comment" * Update layers.py --- python/paddle/v2/framework/layers.py | 85 +++++++++++++++++++++++++++- python/paddle/v2/framework/nets.py | 30 +++++++++- 2 files changed, 112 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py index 0212afec9d..57723c4d5a 100644 --- a/python/paddle/v2/framework/layers.py +++ b/python/paddle/v2/framework/layers.py @@ -5,7 +5,7 @@ import re __all__ = [ 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', - 'StaticRNN', 'cast', 'batch_norm' + 'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool' ] @@ -165,6 +165,18 @@ _create_op_func_('dropout') _create_op_func_('reshape') +def cast(x, data_type, program=None): + helper = LayerHelper('cast', **locals()) + out = helper.create_tmp_variable(dtype=data_type) + helper.append_op( + type='cast', + inputs={'X': [x]}, + outputs={'Out': [out]}, + attrs={'in_data_type': x.data_type, + 'out_data_type': out.data_type}) + return out + + def cast(x, data_type, program=None): helper = LayerHelper('cast', **locals()) out = helper.create_tmp_variable(dtype=data_type) @@ -220,6 +232,46 @@ def square_error_cost(input, label, **kwargs): return square_out +def sequence_conv(input, + num_filters, + name=None, + filter_size=3, + act=None, + stride=1, + padding=None, + bias_attr=None, + param_attr=None, + program=None, + init_program=None): + # FIXME(dzh) : want to unify the argument of python layer + # function. So we ignore some unecessary attributes. + # such as, padding_trainable, context_start. + + helper = LayerHelper('sequence_conv', **locals()) + dtype = helper.input_dtype() + + filter_shape = [num_filters, filter_size] + filter = helper.create_parameter( + attr=helper.param_attr, shape=filter_shape, dtype=dtype) + pre_bias = helper.create_tmp_variable(dtype) + + helper.append_op( + type='sequence_conv', + inputs={ + 'X': [input], + 'Filter': filter, + }, + outputs={"Out": pre_bias}, + attrs={ + 'context_stride': stride, + 'context_start': 0, + 'context_length': filter_size + }) + + pre_act = helper.append_bias_op(pre_bias) + return helper.append_activation(pre_act) + + def conv2d(input, num_filters, name=None, @@ -272,6 +324,35 @@ def conv2d(input, return helper.append_activation(pre_act) +def sequence_pool(input, + pool_size, + pool_type, + pool_stride=1, + pool_padding=0, + global_pooling=False, + program=None, + init_program=None): + # FIXME(dzh) : want to unify the argument of python layer + # function. So we ignore some unecessary attributes + + ENUM_POOL_TYPE = set(["max", "avg", "sqrt", "last", "first"]) + if pool_type not in ENUM_POOL_TYPE: + raise ValueError("Unknown pool_type: '%s'. 
It can only be %s.", + str(pool_type), " ".join(ENUM_POOL_TYPE)) + + helper = LayerHelper('sequence_pool', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_tmp_variable(dtype) + + helper.append_op( + type="sequence_pool", + inputs={"X": [input]}, + outputs={"Out": pool_out}, + attrs={"strategy": pool_type}) + + return pool_out + + def pool2d(input, pool_size, pool_type, @@ -291,7 +372,7 @@ def pool2d(input, if isinstance(pool_padding, int): pool_padding = [pool_padding, pool_padding] - helper = LayerHelper('conv2d', **locals()) + helper = LayerHelper('pool2d', **locals()) dtype = helper.input_dtype() pool_out = helper.create_tmp_variable(dtype) diff --git a/python/paddle/v2/framework/nets.py b/python/paddle/v2/framework/nets.py index 803534fa39..a9998073e1 100644 --- a/python/paddle/v2/framework/nets.py +++ b/python/paddle/v2/framework/nets.py @@ -1,9 +1,11 @@ import paddle.v2.framework.layers as layers +__all__ = ["simple_img_conv_pool", "sequence_conv_pool"] + def simple_img_conv_pool(input, - filter_size, num_filters, + filter_size, pool_size, pool_stride, act, @@ -94,3 +96,29 @@ def img_conv_group(input, program=program, init_program=init_program) return pool_out + + +def sequence_conv_pool(input, + num_filters, + filter_size, + pool_size, + pool_stride, + act, + program=None, + init_program=None): + conv_out = layers.sequence_conv( + input=input, + num_filters=num_filters, + filter_size=filter_size, + act=act, + program=program, + init_program=init_program) + + pool_out = layers.sequence_pool( + input=conv_out, + pool_size=pool_size, + pool_type='max', + pool_stride=pool_stride, + program=program, + init_program=init_program) + return pool_out From 8d1ad97b3d7d2985c47b3cd27989803746feb3e2 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 30 Oct 2017 19:32:23 -0500 Subject: [PATCH 73/81] Add log to `InitParam` `GetParameter` `SendGrad` and etc. 
(#5162) * add logs and fix a bug * fix break buf * modify path bugs * fix by comments * fix by comments * add batch * add float32tostring * add pb support * moidfy gotpaht * compile ok * add proto * delete not need * add proto * add empty proto * clean not need * clean not need * modify deps * fix by comments and update depend * fix compile error * fix loop bugs --- go/.gitignore | 1 + go/glide.lock | 4 +-- go/glide.yaml | 1 + go/proto/.gitignore | 4 +++ go/pserver/CMakeLists.txt | 2 +- go/pserver/service.go | 60 ++++++++++++++++++++++++++++++++++--- go/pserver/service_test.go | 31 +++++++++++++++++++ proto/CMakeLists.txt | 27 +++++++++++++++++ python/paddle/v2/trainer.py | 3 +- 9 files changed, 125 insertions(+), 8 deletions(-) create mode 100644 go/proto/.gitignore diff --git a/go/.gitignore b/go/.gitignore index 000e1fd55b..398d70ca37 100644 --- a/go/.gitignore +++ b/go/.gitignore @@ -1,2 +1,3 @@ vendor/ .glide/ +proto/*.go diff --git a/go/glide.lock b/go/glide.lock index ce654d3636..d15fc934db 100644 --- a/go/glide.lock +++ b/go/glide.lock @@ -1,5 +1,5 @@ -hash: 51d9e2e46d7fd9173ff11ecada40f7b7728756be18d5e2f032535f66465e6e15 -updated: 2017-10-24T15:04:09.987751592-07:00 +hash: 107c058cf5c9163a75d40eef2273a793c36112683c25d72aa8288827fdde3a19 +updated: 2017-10-30T03:46:19.137696069Z imports: - name: github.com/alecthomas/gometalinter version: bae2f1293d092fd8167939d5108d1b025eaef9de diff --git a/go/glide.yaml b/go/glide.yaml index ba253f8beb..c5d66694ac 100644 --- a/go/glide.yaml +++ b/go/glide.yaml @@ -30,3 +30,4 @@ import: version: v2.13 - package: github.com/go-stack/stack version: v1.6.0 +- package: github.com/golang/protobuf diff --git a/go/proto/.gitignore b/go/proto/.gitignore new file mode 100644 index 0000000000..5e7d2734cf --- /dev/null +++ b/go/proto/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore diff --git a/go/pserver/CMakeLists.txt b/go/pserver/CMakeLists.txt index 4fe0a8cb02..9ac05199e7 100644 --- a/go/pserver/CMakeLists.txt +++ b/go/pserver/CMakeLists.txt @@ -13,5 +13,5 @@ # limitations under the License. # if(WITH_TESTING) - go_test(pserver_test DEPS paddle_go_optimizer) + go_test(pserver_test DEPS paddle_go_optimizer gen_proto_go) endif() diff --git a/go/pserver/service.go b/go/pserver/service.go index f703d99a29..7484ec90b1 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -17,6 +17,7 @@ package pserver import ( "bufio" "bytes" + "encoding/binary" "encoding/gob" "encoding/json" "errors" @@ -26,11 +27,15 @@ import ( "os" "path" "strconv" + "strings" "sync" "time" + "github.com/golang/protobuf/proto" uuid "github.com/satori/go.uuid" + pb "github.com/PaddlePaddle/Paddle/go/proto" + log "github.com/inconshreveable/log15" ) @@ -65,6 +70,46 @@ type Parameter struct { Content []byte } +func float32ToString(b []byte) string { + f := make([]float32, len(b)/4) + buf := bytes.NewReader(b) + err := binary.Read(buf, binary.LittleEndian, &f) + if err != nil { + return "" + } + return fmt.Sprintf("%v", f) +} + +func float32ByteToString(c []byte) string { + var a []byte + var b []byte + if len(c) <= 80 { + a = c + } else { + a = c[0:40] + b = c[len(c)-40:] + } + + var s string + s = float32ToString(a) + + if b == nil { + return s + } + + s = strings.Replace(s, "]", "", -1) + "..." 
+ strings.Replace(float32ToString(b), "[", "", -1) + return s +} + +func (p Parameter) String() string { + if p.ElementType != Float32 { + return fmt.Sprintf("name:%v ElementType:%v", + p.Name, p.ElementType) + } + + return float32ByteToString(p.Content) +} + // ParameterWithConfig contains the parameter and the configuration. type ParameterWithConfig struct { Param Parameter @@ -189,7 +234,9 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error default: } - // TODO(helin): parse parameter config + c := &pb.OptimizerConfig{} + proto.Unmarshal(paramWithConfigs.Config, c) + log.Debug(fmt.Sprintf("OptimizerConfig:%v", c)) s.mu.Lock() defer s.mu.Unlock() @@ -239,7 +286,8 @@ func (s *Service) SendGrad(g Gradient, _ *int) error { select { case <-s.initialized: default: - log.Warn("received gradient before initialization.", "name", g.Name, "size", len(g.Content), "type", g.ElementType) + log.Warn("received gradient before initialization.", + "name", g.Name, "size", len(g.Content), "type", g.ElementType) return errors.New(Uninitialized) } @@ -248,10 +296,14 @@ func (s *Service) SendGrad(g Gradient, _ *int) error { o, ok := s.optMap[g.Name] if !ok { + log.Warn("received gradient but can't find name.", + "name", g.Name, "size", len(g.Content), "type", g.ElementType) return fmt.Errorf("parameter: %s does not exist", g.Name) } - log.Info("received gradient from trainer, updating gradient.", "name", g.Name, "size", len(g.Content), "type", g.ElementType) + log.Debug(Parameter(g).String()) + log.Info("received gradient from trainer, updating gradient.", + "name", g.Name, "size", len(g.Content), "type", g.ElementType) return o.UpdateParameter(g) } @@ -277,7 +329,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error { parameter.Name = name parameter.ElementType = opt.elementType parameter.Content = opt.GetWeights() - + log.Debug(parameter.String()) log.Info("sending parameter to the trainer", "name", parameter.Name, "size", len(parameter.Content), "type", parameter.ElementType) return nil } diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go index b6f4566eb7..58a743e1fa 100644 --- a/go/pserver/service_test.go +++ b/go/pserver/service_test.go @@ -15,6 +15,7 @@ package pserver_test import ( + "fmt" "io/ioutil" "reflect" "sync" @@ -178,3 +179,33 @@ func TestBlockUntilInitialized(t *testing.T) { wg.Wait() } + +func TestGradientString(t *testing.T) { + g := pserver.Parameter{} + g.ElementType = pserver.Float32 + g.Content = []byte{0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40} + if g.String() != "[3.3702806e+12 2.142699 3.3702806e+12 2.142699]" { + t.Fatal("get float data error!") + } + + g.Content = []byte{0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, + 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40} + if 
g.String() != "[3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699...3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699]" { + t.Fatal("get float data error!", g.String()) + } + fmt.Println(g) +} diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt index 5d898d860c..556bcd1d7e 100644 --- a/proto/CMakeLists.txt +++ b/proto/CMakeLists.txt @@ -27,3 +27,30 @@ foreach(filename ${proto_filenames}) endforeach() add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY}) + + +if (WITH_GOLANG) + add_custom_target(protoc-gen-go) + add_custom_command(TARGET protoc-gen-go + COMMAND go + ARGS "get" "-u" "github.com/golang/protobuf/protoc-gen-go") + + set(PROTO_GEN_GO) + file(GLOB proto_filenames . OptimizerConfig.proto) + foreach(filename ${proto_filenames}) + message(STATUS ${filename}) + get_filename_component(ABS_FIL ${filename} ABSOLUTE) + get_filename_component(FIL_WE ${filename} NAME_WE) + set(CUR_PROTO_GEN_GO + ${PADDLE_SOURCE_DIR}/paddle/go/proto/${FIL_WE}.pb.go) + set(PROTO_GEN_GO + ${CUR_PROTO_GEN_GO} + ${PROTO_GEN_GO}) + add_custom_command(OUTPUT ${CUR_PROTO_GEN_GO} + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS "--go_out=${PADDLE_SOURCE_DIR}/go/proto" + "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL} + DEPENDS ${ABS_FIL} protoc protoc-gen-go) + endforeach() + add_custom_target(gen_proto_go ALL DEPENDS ${PROTO_GEN_GO}) +endif() diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index b68fd0d5a9..db01ab7374 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -205,7 +205,8 @@ class SGD(object): """ Testing method. Will test input data. - :param reader: A reader that reads and yeilds data items. + :param reader: A batch reader that reads and yeilds data items, + it should be a paddle.v2.batch. :type reader: collections.Iterable :param feeding: Feeding is a map of neural network input name and array index that reader returns. From a128eb7b737941ac5e18fe42d4d8124a5c0cee71 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 31 Oct 2017 08:44:00 +0800 Subject: [PATCH 74/81] improve unique_name, uniq id is related to prefix (#5223) * improve unique_name, uniq id is related to prefix * fix join --- paddle/pybind/pybind.cc | 7 ++++--- python/paddle/v2/framework/framework.py | 5 +++-- python/paddle/v2/framework/layer_helper.py | 2 +- .../v2/framework/tests/test_image_classification_layer.py | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 4baff895da..2a0075356e 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/pybind/protobuf.h" #include // for call_once +#include #include "gflags/gflags.h" #include "paddle/framework/backward.h" #include "paddle/framework/executor.h" @@ -42,9 +43,9 @@ limitations under the License. 
*/ namespace paddle { namespace pybind { -static size_t UniqueIntegerGenerator() { - static std::atomic generator; - return generator.fetch_add(1); +static size_t UniqueIntegerGenerator(const std::string &prefix) { + static std::unordered_map> generators; + return generators[prefix].fetch_add(1); } std::once_flag gflags_init_flag; diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py index 43101c9dda..f8d2f67410 100644 --- a/python/paddle/v2/framework/framework.py +++ b/python/paddle/v2/framework/framework.py @@ -119,8 +119,9 @@ class Variable(object): @staticmethod def _unique_var_name_(): - uid = core.unique_integer() # unique during whole process. - return "_generated_var_%d" % uid + prefix = "_generated_var" + uid = core.unique_integer(prefix) # unique during whole process. + return "_".join([prefix, str(uid)]) @staticmethod def _convert_np_dtype_to_dtype_(np_dtype): diff --git a/python/paddle/v2/framework/layer_helper.py b/python/paddle/v2/framework/layer_helper.py index 1f72c9bc7b..d96dbe172c 100644 --- a/python/paddle/v2/framework/layer_helper.py +++ b/python/paddle/v2/framework/layer_helper.py @@ -8,7 +8,7 @@ from paddle.v2.framework.framework import Variable, g_program, \ def unique_name(prefix): - uid = core.unique_integer() # unique during whole process. + uid = core.unique_integer(prefix) # unique during whole process. return "_".join([prefix, str(uid)]) diff --git a/python/paddle/v2/framework/tests/test_image_classification_layer.py b/python/paddle/v2/framework/tests/test_image_classification_layer.py index 7411689b61..b4eda13552 100644 --- a/python/paddle/v2/framework/tests/test_image_classification_layer.py +++ b/python/paddle/v2/framework/tests/test_image_classification_layer.py @@ -37,7 +37,7 @@ class TestLayer(unittest.TestCase): layers.batch_norm( input=images, program=program, init_program=init_program) - #print str(program) + # print str(program) def test_dropout_layer(self): program = Program() @@ -53,7 +53,7 @@ class TestLayer(unittest.TestCase): program=program, init_program=init_program) - #print str(program) + # print str(program) def test_img_conv_group(self): program = Program() From afd1e844fdc85b6cfb0e44a34b73ba4de8affbc6 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Mon, 30 Oct 2017 17:45:38 -0700 Subject: [PATCH 75/81] remove unused code (#5219) * remove unused code * fix cmake file * fix build error --- paddle/platform/CMakeLists.txt | 1 - paddle/platform/environment.h | 60 ----------------------------- paddle/platform/environment_test.cc | 54 -------------------------- paddle/platform/gpu_info.cc | 8 ---- 4 files changed, 123 deletions(-) delete mode 100644 paddle/platform/environment.h delete mode 100644 paddle/platform/environment_test.cc diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index eb850b6585..bd86a9fe26 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -9,7 +9,6 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(dynload) cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece) -cc_test(environment_test SRCS environment_test.cc DEPS stringpiece) IF(WITH_GPU) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) diff --git a/paddle/platform/environment.h b/paddle/platform/environment.h deleted file mode 100644 index 4edcce932e..0000000000 --- a/paddle/platform/environment.h +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/platform/enforce.h" -#include "paddle/string/piece.h" - -extern char** environ; // for environment variables - -namespace paddle { -namespace platform { - -inline void SetEnvVariable(const std::string& name, const std::string& value) { - PADDLE_ENFORCE_NE(setenv(name.c_str(), value.c_str(), 1), -1, - "Failed to set environment variable %s=%s", name, value); -} - -inline void UnsetEnvVariable(const std::string& name) { - PADDLE_ENFORCE_NE(unsetenv(name.c_str()), -1, - "Failed to unset environment variable %s", name); -} - -inline bool IsEnvVarDefined(const std::string& name) { - return std::getenv(name.c_str()) != nullptr; -} - -inline std::string GetEnvValue(const std::string& name) { - PADDLE_ENFORCE(IsEnvVarDefined(name), - "Tried to access undefined environment variable %s", name); - return std::getenv(name.c_str()); -} - -inline std::vector GetAllEnvVariables() { - std::vector vars; - for (auto var = environ; *var != nullptr; ++var) { - auto tail = string::Index(*var, "="); - auto name = string::SubStr(*var, 0, tail).ToString(); - vars.push_back(name); - } - return vars; -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/platform/environment_test.cc b/paddle/platform/environment_test.cc deleted file mode 100644 index 5f13652721..0000000000 --- a/paddle/platform/environment_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/platform/environment.h" - -#include "glog/logging.h" -#include "gtest/gtest.h" - -TEST(ENVIRONMENT, ACCESS) { - namespace platform = paddle::platform; - namespace string = paddle::string; - - platform::SetEnvVariable("PADDLE_USE_ENV", "TRUE"); - - EXPECT_TRUE(platform::IsEnvVarDefined("PADDLE_USE_ENV")); - EXPECT_EQ(platform::GetEnvValue("PADDLE_USE_ENV"), "TRUE"); - - platform::UnsetEnvVariable("PADDLE_USE_ENV"); - EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV")); - - platform::SetEnvVariable("PADDLE_USE_ENV1", "Hello "); - platform::SetEnvVariable("PADDLE_USE_ENV2", "World, "); - platform::SetEnvVariable("PADDLE_USE_ENV3", "PaddlePaddle!"); - - std::string env_info; - auto vars = platform::GetAllEnvVariables(); - for_each(vars.begin(), vars.end(), [&](const std::string& var) { - env_info += platform::GetEnvValue(var); - }); - - EXPECT_TRUE(string::Contains(env_info, "Hello World, PaddlePaddle!")); - platform::UnsetEnvVariable("PADDLE_USE_ENV1"); - platform::UnsetEnvVariable("PADDLE_USE_ENV2"); - platform::UnsetEnvVariable("PADDLE_USE_ENV3"); - - env_info.clear(); - vars = platform::GetAllEnvVariables(); - for_each(vars.begin(), vars.end(), [&](const std::string& var) { - env_info += platform::GetEnvValue(var); - }); - - EXPECT_FALSE(string::Contains(env_info, "Hello World, PaddlePaddle!")); - EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV1")); - EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV2")); - EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV3")); -} diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 0cab5ffc56..f3455a8733 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/platform/enforce.h" -#include "paddle/platform/environment.h" DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, "Default use 95% of GPU memory for PaddlePaddle," @@ -75,13 +74,6 @@ size_t GpuMaxChunkSize() { GpuMemoryUsage(available, total); - if (IsEnvVarDefined(kEnvFractionGpuMemoryToUse)) { - auto val = std::stod(GetEnvValue(kEnvFractionGpuMemoryToUse)); - PADDLE_ENFORCE_GT(val, 0.0); - PADDLE_ENFORCE_LE(val, 1.0); - FLAGS_fraction_of_gpu_memory_to_use = val; - } - // Reserving the rest memory for page tables, etc. 
size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total; From 669786bfe14690b5c9ee5aed8c271b2cabf6f2c6 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Mon, 30 Oct 2017 17:49:08 -0700 Subject: [PATCH 76/81] refine square_error_cost layer (#5216) * reimplement pow operator * add pow_grad operator * fix code style * fix build error * fix op_test bug * revert pow operator * add FIXME comment --- paddle/operators/activation_op.h | 1 + python/paddle/v2/framework/layers.py | 5 +---- python/paddle/v2/framework/tests/op_test.py | 12 +++++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index e4c6b2e09c..ddd966e26c 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -547,6 +547,7 @@ struct ELUGradFunctor : public BaseActivationFunctor { } }; +// FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 template struct PowFunctor : public BaseActivationFunctor { float factor; diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py index 57723c4d5a..70447e0d81 100644 --- a/python/paddle/v2/framework/layers.py +++ b/python/paddle/v2/framework/layers.py @@ -225,10 +225,7 @@ def square_error_cost(input, label, **kwargs): square_out = helper.create_tmp_variable(dtype=input.data_type) helper.append_op( - type='pow', - inputs={'X': [minus_out]}, - outputs={'Y': [square_out]}, - attrs={'factor': 2.0}) + type='square', inputs={'X': [minus_out]}, outputs={'Y': [square_out]}) return square_out diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py index 50360e6e72..2e6710b5fc 100644 --- a/python/paddle/v2/framework/tests/op_test.py +++ b/python/paddle/v2/framework/tests/op_test.py @@ -281,7 +281,8 @@ class OpTest(unittest.TestCase): type(sub_out)) for sub_out_name, expect in sub_out: idx = find_actual(sub_out_name, fetch_list) - actual_t = np.array(outs[idx]) + actual = outs[idx] + actual_t = np.array(actual) expect_t = expect[0] \ if isinstance(expect, tuple) else expect self.assertTrue( @@ -291,11 +292,12 @@ class OpTest(unittest.TestCase): str(place)) if isinstance(expect, tuple): self.assertListEqual( - actual_t.lod(), expect[1], "Output (" + sub_out_name - + ") has different lod at " + str(place)) + actual.lod(), expect[1], "Output (" + sub_out_name + + ") has different lod at " + str(place)) else: idx = find_actual(out_name, fetch_list) - actual_t = outs[idx] + actual = outs[idx] + actual_t = np.array(actual) expect = self.outputs[out_name] expect_t = expect[0] if isinstance(expect, tuple) else expect self.assertTrue( @@ -303,7 +305,7 @@ class OpTest(unittest.TestCase): actual_t, expect_t, atol=atol), "Output (" + out_name + ") has diff at " + str(place)) if isinstance(expect, tuple): - self.assertListEqual(actual_t.lod(), expect[1], + self.assertListEqual(actual.lod(), expect[1], "Output (" + out_name + ") has different lod at " + str(place)) From 8b1c50c642914f6ab1fb691059d6d88d9995bea1 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 30 Oct 2017 18:57:04 -0700 Subject: [PATCH 77/81] Update the Build PaddlePaddle for Raspberry Pi document (#5177) * Add cross_compiling_for_raspberry.md * Update cross_compiling for raspberry pi document * Some minor edits * In response to comments from Kavya * Add the _en suffix --- .../cross_compiling_for_raspberry_cn.md | 35 +++++------ .../cross_compiling_for_raspberry_en.md | 62 +++++++++++++++++++ 2 files changed, 78 insertions(+), 19 deletions(-) 
create mode 100644 doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md diff --git a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md b/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md index 085b5dda16..026c0c6f3b 100644 --- a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md +++ b/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md @@ -1,39 +1,36 @@ # 构建Raspberry Pi平台上的PaddlePaddle库 -对于Rasspberry Pi系统,用户可通过ssh等方式登录到Raspberry Pi系统上,按照[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档所述,直接编译Raspberry Pi平台上适用的PaddlePaddle库。 +通常有两个方法来构建基于 Rasspberry Pi 的版本: -用户也可以在自己熟悉的开发平台上,通过交叉编译的方式来编译。这篇文档将以Linux x86-64平台为例,介绍交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。 +1. 通过ssh等方式登录到Raspberry Pi系统上来构建。所需的开发工具和第三方库可以参考 [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)。 -## 准备交叉编译环境 +1. 另一个方法是交叉编译。这篇文档介绍在 Linux/x64 上交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。 -从源码交叉编译PaddlePaddle,用户需要提前准备好交叉编译环境。用户可自行前往[github](https://github.com/raspberrypi/tools)下载Raspberry Pi平台使用的C/C++交叉编译工具链,也可通过以下命令获取: +## 安装交叉编译器 + +克隆下面 Github repo ```bash git clone https://github.com/raspberrypi/tools.git ``` -该github仓库中包含若干个预编译好的、针对不同平台的编译工具。宿主机是Linux x86-64环境,则需选用`arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`下的作为编译工具,所使用的编译器为arm-linux-gnueabihf-gcc 4.8.3。 - -注意,该编译工具链需要系统glibc支持2.14以上。 +即可在 `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64` 目录里找到交叉编译器 arm-linux-gnueabihf-gcc 4.8.3。运行该编译工具链需要一台 Linux x64 机器上以及 2.14版本以上的 glibc。 ## 配置交叉编译参数 -CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake),以提供一些默认的编译器和编译参数相关配置。 +CMake[支持交叉编译](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。PaddlePaddle for Raspberry Pi的配置信息在[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake)。 交叉编译Raspberry Pi版本PaddlePaddle库时,有一些必须配置的参数: -- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后,PaddlePaddle的CMake系统才认为在是在交叉编译Raspberry Pi系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。 - -Raspberry Pi平台可选配置参数: +- `CMAKE_SYSTEM_NAME`:CMake编译的目标平台,必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后,PaddlePaddle的CMake系统才认为在是在交叉编译Raspberry Pi系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。 -- `RPI_TOOLCHAIN`,编译工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器;否则,用户需要在cmake时手动设置这些值。无默认值。 -- `RPI_ARM_NEON`,是否使用NEON指令。目前必须设置成`ON`,默认值为`ON`。 +- `RPI_TOOLCHAIN`:编译工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器;否则,用户需要在cmake时手动设置这些值。无默认值。 -其他配置参数: +- `RPI_ARM_NEON`:是否使用NEON指令。目前必须设置成`ON`,默认值为`ON`。 - `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 -cmake参数如下; +一个常用的CMake配置如下: ``` cmake -DCMAKE_SYSTEM_NAME=RPi \ @@ -47,7 +44,9 @@ cmake -DCMAKE_SYSTEM_NAME=RPi \ .. 
``` -用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。 +其中`WITH_C_API=ON`表示需要构建推理库。 + +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。 ## 编译和安装 @@ -60,6 +59,4 @@ make install 注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 -执行完安装命令后,由于上一步cmake配置中`WITH_C_API`设置为`ON`,`your/path/to/install`目录中会包含`include`和`lib`目录,其中`include`中包含C-API的头文件,`lib`中包含一个Raspberry Pi版本的库。 - -更多的编译配置见[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档。 +执行完安装命令后,,`your/path/to/install`目录中会包含`include`和`lib`目录,其中`include`中包含C-API的头文件,`lib`中包含一个Raspberry Pi版本的库。 diff --git a/doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md b/doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md new file mode 100644 index 0000000000..09ac4733ec --- /dev/null +++ b/doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md @@ -0,0 +1,62 @@ +# Build PaddlePaddle for Raspberry Pi + +You may use any of the following two approaches to build the inference library of PaddlePaddle for Raspberry Pi: + +1. Build using SSH: Log in to a Raspberry Pi using SSH and build the library. The required development tools and third-party dependencies are listed in here: [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). + +1. Cross-compile: We talk about how to cross-compile PaddlePaddle for Raspberry Pi on a Linux/x64 machine, in more detail in this article. + +## The Cross-Compiling Toolchain + +Step 1. Clone the Github repo by running the following command. + +```bash +git clone https://github.com/raspberrypi/tools.git +``` + +Step 2. Use the pre-built cross-compiler found in `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`. To run it on a Linux computer, glibc version >= 2.14 is needed. + +## CMake Arguments + +CMake supports [cross-compiling](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). All CMake configuration arguments required for the cross-compilation for Raspberry Pi can be found in [`cmake/cross_compiling/raspberry_pi.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake). + +Some important arguments that need to be set: + +- `CMAKE_SYSTEM_NAME`: The target platform. Must be `RPi`. + +- `RPI_TOOLCHAIN`: The absolute path of the cross-compiling toolchain. + +- `RPI_ARM_NEON`: Use ARM NEON Intrinsics. This is a required argument and set default to `ON`. + +- `HOST_C/CXX_COMPILER`: The C/C++ compiler for the host. It is used to build building tools running on the host, for example, protoc. + +A commonly-used CMake configuration is as follows: + +``` +cmake -DCMAKE_SYSTEM_NAME=RPi \ + -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \ + -DRPI_ARM_NEON=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_GPU=OFF \ + -DWITH_C_API=ON \ + -DWITH_PYTHON=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +To build the inference library, please set the argument WITH_API to ON: `WITH_C_API=ON`. + +You can add more arguments. For example, to minimize the size of the generated inference library, you may use `CMAKE_BUILD_TYPE=MinSizeRel`. For performance optimization, you may use `CMAKE_BUILD_TYPE=Release`. 
+ +## Build and Install + +The following commands build the inference library of PaddlePaddle for Raspberry Pi and third-party dependencies. + +```bash +make +make install +``` + + The intermediate files will be stored in `build`. Third-party libraries will be located in `build/third_party`. If you have already built it for other platforms like Android or iOS, you may want to clear these directories by running the command: `rm -rf build`. + +The infernece library will be in `your/path/to/install/lib`, with related header files in `your/path/to/install/include`. From f122a5da2f27038b48f6ed607e296d762050e920 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 30 Oct 2017 19:35:22 -0700 Subject: [PATCH 78/81] Add accuracy layer (#4958) * Complete accuray layer * Fix error * Fix error * Add 'accuracy' to __all__ * update * Fix Type error * Fix error * Refine unit tests * Fix an unit test error --- paddle/operators/accuracy_op.cc | 6 +++-- paddle/operators/top_k_op.cc | 9 ++++++-- python/paddle/v2/framework/layers.py | 22 ++++++++++++++++++- .../v2/framework/tests/test_accuracy_op.py | 4 ++-- .../tests/test_recognize_digits_conv.py | 13 ++++++----- 5 files changed, 42 insertions(+), 12 deletions(-) diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index eb8bce8da7..88958e1634 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -32,7 +32,8 @@ class AccuracyOp : public framework::OperatorWithKernel { auto inference_dim = ctx->GetInputDim("Inference"); auto label_dim = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ(label_dim.size(), 1, "label must be a vector"); + PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2."); + PADDLE_ENFORCE_EQ(label_dim[1], 1, "label's second dimension must be 1"); PADDLE_ENFORCE_EQ(inference_dim[0], label_dim[0], "inference size must be the same as label size"); @@ -68,7 +69,8 @@ information, or not. But the output only shares the LoD with input `Inference`. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker); +REGISTER_OPERATOR(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, + paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( accuracy, ops::AccuracyKernel, ops::AccuracyKernel); diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc index d5c2c91a5f..ac92572595 100644 --- a/paddle/operators/top_k_op.cc +++ b/paddle/operators/top_k_op.cc @@ -52,7 +52,11 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output tensor of Topk op"); AddOutput("Indices", "The indices of Topk elements of input"); AddComment( - R"DOC(If the input is a vector (1d tensor), finds the k largest entries in the vector and outputs their values and indices as vectors. Thus values[j] is the j-th largest entry in input, and its index is indices[j]. + R"DOC(If the input is a vector (1d tensor), + finds the k largest entries in the vector + and outputs their values and indices as vectors. + Thus values[j] is the j-th largest entry in input, + and its index is indices[j]. For matrices, computes the top k entries in each row. 
)DOC"); AddAttr("k", @@ -66,6 +70,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(top_k, ops::TopkOp, ops::TopkOpMaker); +REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker, + paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(top_k, ops::TopkKernel); diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py index 70447e0d81..4727d139a2 100644 --- a/python/paddle/v2/framework/layers.py +++ b/python/paddle/v2/framework/layers.py @@ -5,7 +5,7 @@ import re __all__ = [ 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', - 'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool' + 'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'accuracy' ] @@ -229,6 +229,26 @@ def square_error_cost(input, label, **kwargs): return square_out +def accuracy(input, label, k=1, **kwargs): + helper = LayerHelper("accuracy", **kwargs) + topk_out = helper.create_tmp_variable(dtype=input.data_type) + topk_indices = helper.create_tmp_variable(dtype="int64") + helper.append_op( + type="top_k", + inputs={"X": [input]}, + outputs={"Out": [topk_out], + "Indices": [topk_indices]}, + attrs={"k": k}) + acc_out_dtype = kwargs.get("out_dtype", "float32") + acc_out = helper.create_tmp_variable(dtype=acc_out_dtype) + helper.append_op( + type="accuracy", + inputs={"Inference": [topk_indices], + "Label": [label]}, + outputs={"Accuracy": [acc_out]}) + return acc_out + + def sequence_conv(input, num_filters, name=None, diff --git a/python/paddle/v2/framework/tests/test_accuracy_op.py b/python/paddle/v2/framework/tests/test_accuracy_op.py index 02be9a0291..f17edd44ae 100644 --- a/python/paddle/v2/framework/tests/test_accuracy_op.py +++ b/python/paddle/v2/framework/tests/test_accuracy_op.py @@ -8,12 +8,12 @@ class TestAccuracyOp(OpTest): self.op_type = "accuracy" n = 8192 infer = np.random.randint(0, 2, (n, 1)).astype("int") - label = np.random.randint(0, 2, (n, )).astype("int") + label = np.random.randint(0, 2, (n, 1)).astype("int") self.inputs = {'Inference': infer, "Label": label} num_correct = 0 for rowid in xrange(n): for ele in infer[rowid]: - if ele == label[rowid]: + if ele == label[rowid][0]: num_correct += 1 break self.outputs = { diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py index a9b6c8410e..92b1d05426 100644 --- a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py +++ b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py @@ -51,12 +51,14 @@ predict = layers.fc(input=conv_pool_2, cost = layers.cross_entropy( input=predict, label=label, program=program, init_program=init_program) avg_cost = layers.mean(x=cost, program=program) +accuracy = layers.accuracy( + input=predict, label=label, program=program, init_program=init_program) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) opts = sgd_optimizer.minimize(avg_cost) BATCH_SIZE = 50 -PASS_NUM = 1 +PASS_NUM = 3 train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=500), @@ -83,10 +85,11 @@ for pass_id in range(PASS_NUM): outs = exe.run(program, feed={"pixel": tensor_img, "label": tensor_y}, - fetch_list=[avg_cost]) - + fetch_list=[avg_cost, accuracy]) loss = np.array(outs[0]) + acc = np.array(outs[1]) - if loss < 10.0: - exit(0) # if avg cost less than 10.0, we think our code is good. 
+ if loss < 10.0 and acc > 0.9: + # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good. + exit(0) exit(1) From 29fe2a01bdf07bdab4182a7989b3300e718331de Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 31 Oct 2017 14:36:38 +0800 Subject: [PATCH 79/81] add comments in test_Expand.cpp --- paddle/gserver/tests/test_Expand.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/gserver/tests/test_Expand.cpp b/paddle/gserver/tests/test_Expand.cpp index a84a518a01..d32bf0152f 100644 --- a/paddle/gserver/tests/test_Expand.cpp +++ b/paddle/gserver/tests/test_Expand.cpp @@ -91,6 +91,8 @@ TEST(Layer, ExpandLayerFwd) { doOneExpandTest("non-seq", false, useGpu, input1, input2, result); // CPU case 2. non-seq expand to sub-seq + // NOTE: input1.batch_size == input2.sequencelength in this case. + // i.e, input1 expands by input2.sequence // input1 = 1,2,3 // input2 = [[4,5]],[[6]],[[7],[8,9]] // result = [[1,1]],[[2]],[[3],[3,3]] From 878dd88f6107fb81a9c9db99abad0f770b8c9d1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Tue, 31 Oct 2017 15:37:23 +0800 Subject: [PATCH 80/81] Refine evaluator op types (#5208) * refine evaluator op types * update * follow comments * update * fix v2 mnist case * fix v2 mnist case * update * update --- paddle/operators/accuracy_op.cc | 39 +++++++++++++------ paddle/operators/accuracy_op.cu | 24 +++++++----- paddle/operators/accuracy_op.h | 9 +++-- paddle/operators/auc_op.cc | 38 ++++++++++++------ paddle/operators/auc_op.h | 37 ++++++++---------- python/paddle/v2/framework/layers.py | 7 +++- .../v2/framework/tests/test_accuracy_op.py | 11 +++--- .../paddle/v2/framework/tests/test_auc_op.py | 16 ++++---- 8 files changed, 108 insertions(+), 73 deletions(-) diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 88958e1634..2a2a1e9cfd 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -22,23 +22,35 @@ class AccuracyOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Inference"), - "Input(Inference) of AccuracyOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Out"), + "Input (Out) of accuracy op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input (Indices) of accuracy op should not be null."); PADDLE_ENFORCE(ctx->HasInput("Label"), - "Input(Label) of AccuracyOp should not be null."); + "Input (Label) of accuracy op should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Accuracy"), - "Output(Accuracy) of AccuracyOp should not be null."); + "Output (Accuracy) of AccuracyOp should not be null."); - auto inference_dim = ctx->GetInputDim("Inference"); + auto inference_dim = ctx->GetInputDim("Out"); auto label_dim = ctx->GetInputDim("Label"); + // Assume indices has same shape with infernece, because + // it's the output of topk. 
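The stricter shape check below expects labels as a rank-2 `(batch_size, 1)` tensor rather than a flat vector. A minimal sketch of adapting a flat label array on the Python side (assuming int64 labels, which is how the refined kernels now read them):

```python
import numpy as np

# reshape a flat integer label vector into the (batch_size, 1) layout
# the refined accuracy op enforces
labels = np.random.randint(0, 10, (128,))
labels = labels.reshape((-1, 1)).astype("int64")
assert labels.ndim == 2 and labels.shape[1] == 1
```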
PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2."); PADDLE_ENFORCE_EQ(label_dim[1], 1, "label's second dimension must be 1"); PADDLE_ENFORCE_EQ(inference_dim[0], label_dim[0], - "inference size must be the same as label size"); + "the inference tensor's num_rows must be" + " the same as label."); ctx->SetOutputDim("Accuracy", {1}); - ctx->ShareLoD("Inference", /*->*/ "Accuracy"); + ctx->ShareLoD("Out", /*->*/ "Accuracy"); + } + + protected: + // IndicateDataType + framework::DataType IndicateDataType( + const framework::ExecutionContext &ctx) const override { + return framework::ToDataType(ctx.Input("Out")->type()); } }; @@ -48,7 +60,8 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { // TODO(typhoonzero): support both inference value and indices. - AddInput("Inference", "topk(indices) the network output"); + AddInput("Out", "topk (inferences) the network output"); + AddInput("Indices", "topk (indices) the network output"); AddInput("Label", "Label of the training data"); // TODO(typhoonzero): AddInput("Weight", ... AddOutput("Accuracy", "The accuracy of current batch"); @@ -59,7 +72,7 @@ The accuracy is: .. math:: accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples}) -Both the input `Inference` and `Label` can carry the LoD (Level of Details) +Both the input `Out` and `Label` can carry the LoD (Level of Details) information, or not. But the output only shares the LoD with input `Inference`. )DOC"); } @@ -71,6 +84,8 @@ information, or not. But the output only shares the LoD with input `Inference`. namespace ops = paddle::operators; REGISTER_OPERATOR(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - accuracy, ops::AccuracyKernel, - ops::AccuracyKernel); +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. +REGISTER_OP_CPU_KERNEL(accuracy, + ops::AccuracyKernel, + ops::AccuracyKernel); diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index be58dfbd03..a0483f367e 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -21,9 +21,10 @@ namespace paddle { namespace operators { using platform::PADDLE_CUDA_NUM_THREADS; -template -__global__ void AccuracyCudaKernel(const int N, const int D, const T* Xdata, - const T* labeldata, float* accuracy) { +template +__global__ void AccuracyCudaKernel(const int N, const int D, + const int64_t* Xdata, + const int64_t* labeldata, float* accuracy) { int count = 0; __shared__ int total[BlockSize]; @@ -52,13 +53,14 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "It must use GPUPlace."); - auto* inference = ctx.Input("Inference"); + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); auto* label = ctx.Input("Label"); auto* accuracy = ctx.Output("Accuracy"); // FIXME(typhoonzero): only support indices currently // if add support for output values, how to detect the data type? 
- const T* inference_data = inference->data(); - const T* label_data = label->data(); + const int64_t* indices_data = indices->data(); + const int64_t* label_data = label->data(); float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); size_t num_samples = inference->dims()[0]; @@ -69,11 +71,11 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { return; } - AccuracyCudaKernel<<< + AccuracyCudaKernel<<< 1, PADDLE_CUDA_NUM_THREADS, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(num_samples, infer_width, inference_data, label_data, + .stream()>>>(num_samples, infer_width, indices_data, label_data, accuracy_data); } }; @@ -81,5 +83,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int +REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h index 12c6b9aac8..1968b53d19 100644 --- a/paddle/operators/accuracy_op.h +++ b/paddle/operators/accuracy_op.h @@ -38,14 +38,15 @@ template class AccuracyKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Inference"); + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); auto* label = ctx.Input("Label"); auto* accuracy = ctx.Output("Accuracy"); float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - const T* inference_data = inference->data(); - const T* label_data = label->data(); + const int64_t* indices_data = indices->data(); + const int64_t* label_data = label->data(); size_t num_samples = inference->dims()[0]; size_t class_dim = inference->dims()[1]; @@ -60,7 +61,7 @@ class AccuracyKernel : public framework::OpKernel { for (size_t i = 0; i < num_samples; ++i) { PADDLE_ENFORCE_GE(label_data[i], 0, "label must >= 0"); for (size_t j = 0; j < class_dim; ++j) { - if (inference_data[i * class_dim + j] == label_data[i]) { + if (indices_data[i * class_dim + j] == label_data[i]) { ++num_correct; break; } diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc index cf3dbc5d10..f5784922af 100644 --- a/paddle/operators/auc_op.cc +++ b/paddle/operators/auc_op.cc @@ -23,18 +23,26 @@ class AucOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Inference"), - "Input of Inference must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input of Indices must be initialized."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input of Label must be initialized."); - auto inference_dim = ctx->GetInputDim("Inference"); - auto label_dim = ctx->GetInputDim("Label"); + auto inference_height = ctx->GetInputDim("Out")[0]; + auto label_height = ctx->GetInputDim("Label")[0]; - PADDLE_ENFORCE_EQ(inference_dim, label_dim, - "inference and label should have same shape"); + PADDLE_ENFORCE_EQ(inference_height, label_height, + "Out and Label should have same height."); ctx->SetOutputDim("AUC", {1}); - ctx->ShareLoD("Inference", /*->*/ "AUC"); + ctx->ShareLoD("Out", /*->*/ "AUC"); + } + + protected: + // IndicateDataType + framework::DataType 
IndicateDataType( + const framework::ExecutionContext &ctx) const override { + return framework::ToDataType(ctx.Input("Out")->type()); } }; @@ -42,12 +50,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { public: AucOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("Inference", - "A floating point tensor of arbitrary shape and whose values" - "are in the range [0, 1]."); + AddInput("Out", + "A floating point 2D tensor, values are in the range [0, 1]." + "Each row is descend sorted. This input should be the" + "output of topk." + "Typically, this tensor indicates the probability of each label"); + AddInput("Indices", + "An int 2D tensor, indicating the indices of original" + "tensor before sort. Typically, this tensor indicates which label" + "the probability stands for."); AddInput("Label", - "A tensor whose shape matches " - "Inference. Will be cast to bool."); + "A 2D int tensor indicating the label of the training data." + "The height is batch size and width is always 1."); // TODO(typhoonzero): support weight input AddOutput("AUC", "A scalar representing the " diff --git a/paddle/operators/auc_op.h b/paddle/operators/auc_op.h index be6ef29d5f..e5ac57b038 100644 --- a/paddle/operators/auc_op.h +++ b/paddle/operators/auc_op.h @@ -29,7 +29,7 @@ template class AucKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Inference"); + auto* inference = ctx.Input("Out"); auto* label = ctx.Input("Label"); auto* auc = ctx.Output("AUC"); @@ -46,18 +46,11 @@ class AucKernel : public framework::OpKernel { thresholds_list[0] = 0.0f - kEpsilon; thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon; - size_t num_samples = inference->numel(); + size_t batch_size = inference->dims()[0]; + size_t inference_width = inference->dims()[1]; const T* inference_data = inference->data(); - Tensor label_casted; - label_casted.Resize(label->dims()); - bool* label_casted_data = label_casted.mutable_data(ctx.GetPlace()); - - const int* label_data = label->data(); - // cast label_data to bool - for (size_t i = 0; i < num_samples; i++) { - label_casted_data[i] = static_cast(label_data[i]); - } + const int64_t* label_data = label->data(); // Create local tensor for storing the curve: TP, FN, TN, FP // TODO(typhoonzero): use eigen op to caculate these values. 
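The threshold sweep the kernel performs below corresponds roughly to this NumPy computation (a hedged sketch; the operator generates its threshold list slightly differently, as the note in test_auc_op.py explains, so values will not match sklearn or this sketch exactly):

```python
import numpy as np

def roc_auc(out, labels, num_thresholds=200):
    # labels > 0 are treated as positives; out[:, 0] is the max
    # probability per row, i.e. the first column of the topk output
    kEpsilon = 1e-6
    thresholds = np.linspace(0.0 - kEpsilon, 1.0 + kEpsilon, num_thresholds)
    pos = labels[:, 0] > 0
    tpr, fpr = [], []
    for t in thresholds:
        pred = out[:, 0] >= t
        tp = float(np.sum(pred & pos))
        fp = float(np.sum(pred & ~pos))
        fn = float(np.sum(~pred & pos))
        tn = float(np.sum(~pred & ~pos))
        tpr.append(tp / (tp + fn + kEpsilon))
        fpr.append(fp / (fp + tn + kEpsilon))
    # integrate TPR over FPR with the trapezoid rule; FPR falls as the
    # threshold rises, so the increments are non-negative
    return sum((fpr[i] - fpr[i + 1]) * (tpr[i] + tpr[i + 1]) / 2.0
               for i in range(num_thresholds - 1))

out = np.array([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4]], dtype=np.float32)
labels = np.array([[1], [0], [1]])
print(roc_auc(out, labels))  # close to 1.0: positives outscore the negative
```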
@@ -68,23 +61,27 @@ class AucKernel : public framework::OpKernel { true_negative.Resize({num_thresholds}); false_positive.Resize({num_thresholds}); - int* tp_data = true_positive.mutable_data(ctx.GetPlace()); - int* fn_data = false_negative.mutable_data(ctx.GetPlace()); - int* tn_data = true_negative.mutable_data(ctx.GetPlace()); - int* fp_data = false_positive.mutable_data(ctx.GetPlace()); + int64_t* tp_data = true_positive.mutable_data(ctx.GetPlace()); + int64_t* fn_data = false_negative.mutable_data(ctx.GetPlace()); + int64_t* tn_data = true_negative.mutable_data(ctx.GetPlace()); + int64_t* fp_data = false_positive.mutable_data(ctx.GetPlace()); for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) { // caculate TP, FN, TN, FP for current thresh - int tp = 0, fn = 0, tn = 0, fp = 0; - for (size_t i = 0; i < num_samples; i++) { - if (label_casted_data[i]) { - if (inference_data[i] >= (thresholds_list[idx_thresh])) { + int64_t tp = 0, fn = 0, tn = 0, fp = 0; + for (size_t i = 0; i < batch_size; i++) { + // NOTE: label_data used as bool, labels >0 will be treated as true. + if (label_data[i]) { + // use first(max) data in each row + if (inference_data[i * inference_width] >= + (thresholds_list[idx_thresh])) { tp++; } else { fn++; } } else { - if (inference_data[i] >= (thresholds_list[idx_thresh])) { + if (inference_data[i * inference_width] >= + (thresholds_list[idx_thresh])) { fp++; } else { tn++; diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py index 4727d139a2..6451d11e2b 100644 --- a/python/paddle/v2/framework/layers.py +++ b/python/paddle/v2/framework/layers.py @@ -243,8 +243,11 @@ def accuracy(input, label, k=1, **kwargs): acc_out = helper.create_tmp_variable(dtype=acc_out_dtype) helper.append_op( type="accuracy", - inputs={"Inference": [topk_indices], - "Label": [label]}, + inputs={ + "Out": [topk_out], + "Indices": [topk_indices], + "Label": [label] + }, outputs={"Accuracy": [acc_out]}) return acc_out diff --git a/python/paddle/v2/framework/tests/test_accuracy_op.py b/python/paddle/v2/framework/tests/test_accuracy_op.py index f17edd44ae..6536c297e8 100644 --- a/python/paddle/v2/framework/tests/test_accuracy_op.py +++ b/python/paddle/v2/framework/tests/test_accuracy_op.py @@ -7,13 +7,14 @@ class TestAccuracyOp(OpTest): def setUp(self): self.op_type = "accuracy" n = 8192 - infer = np.random.randint(0, 2, (n, 1)).astype("int") - label = np.random.randint(0, 2, (n, 1)).astype("int") - self.inputs = {'Inference': infer, "Label": label} + infer = np.random.random((n, 1)).astype("float32") + indices = np.random.randint(0, 2, (n, 1)) + label = np.random.randint(0, 2, (n, 1)) + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} num_correct = 0 for rowid in xrange(n): - for ele in infer[rowid]: - if ele == label[rowid][0]: + for ele in indices[rowid]: + if ele == label[rowid]: num_correct += 1 break self.outputs = { diff --git a/python/paddle/v2/framework/tests/test_auc_op.py b/python/paddle/v2/framework/tests/test_auc_op.py index 65f679cfcc..26ea905d88 100644 --- a/python/paddle/v2/framework/tests/test_auc_op.py +++ b/python/paddle/v2/framework/tests/test_auc_op.py @@ -6,10 +6,11 @@ from op_test import OpTest class TestAucOp(OpTest): def setUp(self): self.op_type = "auc" - pred = np.random.random((128)).astype("float32") - labels = np.random.randint(0, 2, (128, )) + pred = np.random.random((128, 2)).astype("float32") + indices = np.random.randint(0, 2, (128, 2)) + labels = np.random.randint(0, 2, (128, 1)) num_thresholds = 
200 - self.inputs = {'Inference': pred, 'Label': labels} + self.inputs = {'Out': pred, 'Indices': indices, 'Label': labels} self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} # NOTE: sklearn use a different way to generate thresholds # which will cause the result differs slightly: @@ -31,12 +32,12 @@ class TestAucOp(OpTest): tp, fn, tn, fp = 0, 0, 0, 0 for i, lbl in enumerate(labels): if lbl: - if pred[i] >= thresh: + if pred[i, 0] >= thresh: tp += 1 else: fn += 1 else: - if pred[i] >= thresh: + if pred[i, 0] >= thresh: fp += 1 else: tn += 1 @@ -62,6 +63,5 @@ class TestAucOp(OpTest): self.check_output() -# TODO(typhoonzero): add this back till we fix it -#if __name__ == "__main__": -# unittest.main() +if __name__ == "__main__": + unittest.main() From 1a690279331b39fc20b43ac1e01e88c8504e7110 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 31 Oct 2017 18:26:26 +0800 Subject: [PATCH 81/81] correct the index of cluster_train_cn/en.md --- doc/howto/usage/cluster/cluster_train_cn.md | 36 ++++++++++----------- doc/howto/usage/cluster/cluster_train_en.md | 36 ++++++++++----------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md index 93c5544bcf..2e98b3de3f 100644 --- a/doc/howto/usage/cluster/cluster_train_cn.md +++ b/doc/howto/usage/cluster/cluster_train_cn.md @@ -19,7 +19,7 @@ * [启动集群作业](#启动集群作业-1) * [在Kubernetes集群中提交训练作业](#在kubernetes集群中提交训练作业) -# 概述 +## 概述 本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示: @@ -32,7 +32,7 @@ 在使用同步SGD训练神经网络时,PaddlePaddle使用同步屏障(barrier),使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中,则并不会等待所有trainer提交梯度才更新参数,这样极大地提高了计算的并行性:参数服务器之间不相互依赖,并行地接收梯度和更新参数,参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步,计算节点之间也不会相互依赖,并行地执行模型的训练。可以看出,虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新,在任意时间某一台参数服务器上保存的参数可能比另一台要更新,与同步SGD相比,梯度会有噪声。 -# 环境准备 +## 环境准备 1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被成为一个“节点”。 1. 
我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install)的多种安装方式。我们推荐使用[Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)安装方式来快速安装PaddlePaddle。 @@ -51,8 +51,8 @@ PaddlePaddle 0.10.0, compiled with 下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。 -# 启动参数说明 -## 启动参数服务器 +## 启动参数说明 +### 启动参数服务器 执行以下的命令启动一个参数服务器并等待和计算节点的数据交互 ```bash $ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 @@ -70,7 +70,7 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num | ports_num_for_sparse | 必选 | 1 | 用于稀疏类型参数通信的端口个数 | | num_gradient_servers | 必选 | 1 | 当前训练任务pserver总数 | -## 启动计算节点 +### 启动计算节点 执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py) ```bash $ python train.py @@ -117,7 +117,7 @@ paddle.init( | pservers | 必选 | 127.0.0.1 | 当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 | -## 准备数据集 +### 准备数据集 参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py),准备训练数据和验证数据集,我们使用paddle.dataset.imikolov数据集,并根据分布式训练并发数(trainer节点个数),在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。 @@ -149,7 +149,7 @@ test.txt-00002 对于不同的训练任务,训练数据格式和训练程序的`reader()`会大不相同,所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。 -## 准备训练程序 +### 准备训练程序 我们会对每个训练任务都会在每个节点上创建一个工作空间(workspace),其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。 @@ -184,7 +184,7 @@ test.txt-00002 - `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。 - `test_data_dir`:包含测试数据集的目录。 -# 使用分布式计算平台或工具 +## 使用分布式计算平台或工具 PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括: - [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。 @@ -195,12 +195,12 @@ PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务 在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。 -## 使用Fabric启动集群作业 +### 使用Fabric启动集群作业 -### 准备一个Linux集群 +#### 准备一个Linux集群 可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下,执行`kubectl -f ssh_servers.yaml`启动一个测试集群,并使用`kubectl get po -o wide`获得这些节点的IP地址。 -### 启动集群作业 +#### 启动集群作业 `paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。 @@ -216,10 +216,10 @@ sh run.sh 集群作业将会在几秒后启动。 -### 终止集群作业 +#### 终止集群作业 `paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。 -### 检查集群训练结果 +#### 检查集群训练结果 详细信息请检查 $workspace/log 里的日志,每一个节点都有相同的日志结构。 `paddle_trainer.INFO` @@ -234,13 +234,13 @@ sh run.sh `train.log` 提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。 -### 检查模型输出 +#### 检查模型输出 运行完成后,模型文件将被写入节点 0 的 `output` 目录中。 工作空间中的 `nodefile` 表示当前集群作业的节点 ID。 -## 在OpenMPI集群中提交训练作业 +### 在OpenMPI集群中提交训练作业 -### 准备OpenMPI集群 +#### 准备OpenMPI集群 执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点: @@ -252,7 +252,7 @@ kubectl create -f mpi-nodes.yaml 然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。 -### 启动集群作业 +#### 启动集群作业 您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务: @@ -280,6 +280,6 @@ scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh ``` -## 在Kubernetes集群中提交训练作业 +### 在Kubernetes集群中提交训练作业 此部分的使用方法可以参考[here](../k8s/k8s_distributed_cn.md)。 diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md index 1e8b4d54b9..baa97c0c02 100644 --- a/doc/howto/usage/cluster/cluster_train_en.md +++ b/doc/howto/usage/cluster/cluster_train_en.md @@ -19,7 +19,7 @@ * [Launching Cluster 
Job](#launching-cluster-job-1) * [Cluster Training Using Kubernetes](#cluster-training-using-kubernetes) -# Introduction +## Introduction In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job: @@ -33,7 +33,7 @@ PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and When training with synchronous SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradient updates and parameter downloads happen in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish uploading at each step; this increases the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noise in the gradients. -# Preparations +## Preparations 1. Prepare your computer cluster. It's normally a group of Linux servers connected by a LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes". 2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install the proper GPU driver and CUDA libraries. To install PaddlePaddle please read [this build and install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install) document. We strongly recommend using [Docker installation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst). @@ -52,9 +52,9 @@ PaddlePaddle 0.10.0rc, compiled with We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using the PaddlePaddle v2 API. -# Command-line arguments +## Command-line arguments -## Starting parameter server +### Starting parameter server Type the command below to start a parameter server, which will wait for trainers to connect: @@ -74,7 +74,7 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num | ports_num_for_sparse | required | 1 | number of ports which serves sparse parameter update | | num_gradient_servers | required | 1 | total number of gradient servers | -## Starting trainer +### Starting trainer Type the command below to start the trainer (name the file whatever you want, like "train.py") ```bash $ python train.py ``` @@ -122,7 +122,7 @@ paddle.init( | trainer_id | required | 0 | ID for every trainer, start from 0 | | pservers | required | 127.0.0.1 | list of IPs of parameter servers, separated by "," | -## Prepare Training Dataset +### Prepare Training Dataset Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py); it will download the public `imikolov` dataset and split it into multiple files according to job parallelism (trainer count). Modify `SPLIT_COUNT` at the beginning of `prepare.py` to change the count of output files. @@ -155,7 +155,7 @@ When job started, every trainer needs to get it's own part of data. In some dist Different training jobs may have different data formats and `reader()` functions, so developers may need to write different data preparation scripts and `reader()` functions for their job. 
-## Prepare Training program +### Prepare Training program We'll create a *workspace* directory on each node, storing your training program, dependencies, and the mounted or downloaded dataset directory. @@ -191,7 +191,7 @@ Your workspace may looks like: - `train_data_dir`: containing training data. Mount from a storage service or copy training data here. - `test_data_dir`: containing testing data. -# Use cluster platforms or cluster management tools +## Use cluster platforms or cluster management tools PaddlePaddle supports running jobs on several platforms including: - [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google. @@ -202,13 +202,13 @@ We'll introduce cluster job management on these platforms. The examples can be f These cluster platforms provide APIs or environment variables to the training processes when the job is dispatched to different nodes, such as the node ID, IPs, and the total number of nodes. -## Cluster Training Using Fabric +### Cluster Training Using Fabric -### Prepare a Linux cluster +#### Prepare a Linux cluster Run `kubectl create -f ssh_servers.yaml` under the directory `paddle/scripts/cluster_train_v2/fabric/docker_cluster` to launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes. -### Launching Cluster Job +#### Launching Cluster Job `paddle.py` provides automated scripts to start all PaddlePaddle cluster processes on different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes. `paddle.py` provides two distinguished command options for easy job launching. @@ -224,10 +224,10 @@ sh run.sh The cluster job will start in several seconds. -### Kill Cluster Job +#### Kill Cluster Job `paddle.py` can capture the `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill the cluster job. You should manually kill the job if the program crashed. -### Check Cluster Training Result +#### Check Cluster Training Result Check the logs in $workspace/log for details; each node has the same log structure. `paddle_trainer.INFO` @@ -242,13 +242,13 @@ It provides stderr and stdout of parameter server process. Check error log if tr `train.log` It provides the stderr and stdout of the trainer process. Check the error log if training crashes. -### Check Model Output +#### Check Model Output After one pass finishes, model files will be written to the `output` directory on node 0. The `nodefile` in the workspace indicates the node id of the current cluster job. -## Cluster Training Using OpenMPI +### Cluster Training Using OpenMPI -### Prepare an OpenMPI cluster +#### Prepare an OpenMPI cluster Run the following command to start a 3-node MPI cluster and one "head" node. @@ -260,7 +260,7 @@ kubectl create -f mpi-nodes.yaml Then you can log in to every OpenMPI node using ssh without entering any password. -### Launching Cluster Job +#### Launching Cluster Job Follow the steps below to launch a PaddlePaddle training job in the OpenMPI cluster: @@ -288,6 +288,6 @@ scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh ``` -## Cluster Training Using Kubernetes +### Cluster Training Using Kubernetes The details can be found [here](../k8s/k8s_cn.md).
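
To make the `reader()` and data-splitting discussion in the two cluster-training documents above concrete, here is a minimal sketch of a trainer-side reader that consumes only this trainer's shard of a pre-split dataset. The shard naming (`train.txt-00000`, `train.txt-00001`, ...) follows the `prepare.py` example in those documents; the `trainer_id` parameter and the one-sample-per-line, whitespace-separated format are illustrative assumptions, not part of the patched files.

```python
import os


def shard_reader(data_dir, trainer_id):
    """Build a reader over the shard owned by this trainer.

    Assumes prepare.py has split the data into files named
    train.txt-00000, train.txt-00001, ... and that shard N
    belongs to the trainer whose trainer_id is N.
    """
    shard = os.path.join(data_dir, "train.txt-%05d" % trainer_id)

    def reader():
        with open(shard) as f:
            for line in f:
                # One whitespace-separated sample per line (assumed format).
                yield line.split()

    return reader
```

A trainer would pass `shard_reader(train_data_dir, trainer_id)` into its training loop, so that each of the N trainers reads only its own 1/N of the data.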
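
Similarly, since the cluster platforms above hand each process its runtime parameters through environment variables, a `train.py` can derive its `paddle.init` arguments from the environment instead of hard-coding them. This is only a sketch: the variable names `PADDLE_TRAINER_ID` and `PADDLE_PSERVERS` are hypothetical placeholders, and Fabric, OpenMPI, and Kubernetes each expose their own names, which a launcher script would need to map accordingly. The `paddle.init` parameters themselves are the ones documented in the tables above.

```python
import os

import paddle.v2 as paddle

# Hypothetical variables injected by the cluster platform or launcher script.
trainer_id = int(os.environ.get("PADDLE_TRAINER_ID", "0"))
pservers = os.environ.get("PADDLE_PSERVERS", "127.0.0.1")  # comma-separated IPs

paddle.init(
    use_gpu=False,
    trainer_count=1,
    port=7164,
    ports_num=1,
    ports_num_for_sparse=1,
    num_gradient_servers=1,
    trainer_id=trainer_id,
    pservers=pservers)
```

With this pattern the same `train.py` runs unchanged on every node; only the injected environment differs from trainer to trainer.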