From 540cc2c1c1a203758346cd2ce226d7564c0dad88 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Fri, 29 Sep 2017 22:11:48 -0700
Subject: [PATCH 01/61] add executor class and interface

---
 paddle/framework/CMakeLists.txt   |   2 +
 paddle/framework/executor.cc      | 108 ++++++++++++++++++++++++++++++
 paddle/framework/executor.h       |  32 +++++++++
 paddle/framework/executor_test.cc |  18 +++++
 4 files changed, 160 insertions(+)
 create mode 100644 paddle/framework/executor.cc
 create mode 100644 paddle/framework/executor.h
 create mode 100644 paddle/framework/executor_test.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 8a5d8532bb..3ee721ac93 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -43,3 +43,5 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
+
+cc_library(executor SRCS executor.cc DEPS device_context framework_proto)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
new file mode 100644
index 0000000000..ccf6716949
--- /dev/null
+++ b/paddle/framework/executor.cc
@@ -0,0 +1,108 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/executor.h"
+
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+class LinearListView;
+class GraphView;
+
+// Immutable view of a ProgramDesc organized for efficient execution.
+class ProgramDescView {
+ public:
+  virtual ~ProgramDescView() {}
+  virtual void Initialize(const ProgramDesc*) = 0;
+  static ProgramDescView* Create(bool is_linear);
+};
+
+class LinearListView : public ProgramDescView {
+ public:
+  void Initialize(const ProgramDesc*) override;
+};
+
+class GraphView : public ProgramDescView {
+ public:
+  void Initialize(const ProgramDesc*) override;
+};
+
+static ProgramDescView* Create(bool is_linear) {
+  if (is_linear) {
+    return new LinearListView();
+  } else {
+    return new GraphView();
+  }
+}
+
+void LinearListView::Initialize(const ProgramDesc*) {
+  // get a LinearView of ProgramDesc
+}
+
+void GraphView::Initialize(const ProgramDesc*) {
+  // get a GraphView of ProgramDesc
+}
+
+class ExecutorImpl : public Executor {
+ public:
+  ExecutorImpl(const platform::DeviceContext* ctx, const ProgramDesc* pdesc,
+               bool is_linear)
+      : device_context_(ctx),
+        program_desc_(pdesc),
+        view_(ProgramDescView::Create(is_linear)) {}
+
+  virtual ~ExecutorImpl() {
+    if (view_) delete view_;
+  }
+
+  void Run() override;
+
+  void Initialize();
+
+ private:
+  const platform::DeviceContext* device_context_;
+  const ProgramDesc* program_desc_;
+  ProgramDescView* view_;
+};
+
+static Executor* NewLocalExecutor(const platform::Place& place,
+                                  const ProgramDesc& pdesc, bool is_linear) {
+  platform::DeviceContext* device_context = nullptr;
+  if (platform::is_cpu_place(place)) {
+    device_context =
+        new platform::CPUDeviceContext(boost::get<platform::CPUPlace>(place));
+  }
+#ifndef PADDLE_ONLY_CPU
+  else if {
+    device_context =
+        new platform::CUDADeviceContext(boost::get<platform::GPUPlace>(place));
+  }
+#endif
+  return new ExecutorImpl(device_context, &pdesc, is_linear);
+}
+
+void ExecutorImpl::Run() {
+  // operators running
+  device_context_->Wait();
+}
+
+void ExecutorImpl::Initialize() {
+  // Initialize the ProgramDescView
+  view_->Initialize(program_desc_);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
new file mode 100644
index 0000000000..69f0e3f18f
--- /dev/null
+++ b/paddle/framework/executor.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/framework.pb.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+class Executor {
+ public:
+  virtual ~Executor() {}
+  virtual void Run() = 0;
+};
+
+static Executor* NewLocalExecutor(const platform::Place&, const ProgramDesc&);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
new file mode 100644
index 0000000000..f8a41b12ad
--- /dev/null
+++ b/paddle/framework/executor_test.cc
@@ -0,0 +1,18 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/executor.h"
+#include "gtest/gtest.h"
+
+TEST(Executor, Init) {}
\ No newline at end of file

From 3481bdc865571d2cfac1576d0913ab3f827b5955 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Fri, 29 Sep 2017 22:32:41 -0700
Subject: [PATCH 02/61] add global device context

---
 paddle/framework/executor.cc | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index ccf6716949..8534e70f48 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/executor.h"
-
+#include <memory>
 #include "paddle/platform/device_context.h"
 
 namespace paddle {
@@ -78,17 +78,28 @@ class ExecutorImpl : public Executor {
   ProgramDescView* view_;
 };
 
+template <typename T, typename... Args>
+std::unique_ptr<T> make_unique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+static std::unique_ptr<platform::CPUDeviceContext> g_cpu_device_context =
+    make_unique<platform::CPUDeviceContext>(platform::CPUPlace());
+
+#ifndef PADDLE_ONLY_CPU
+static std::unique_ptr<platform::CUDADeviceContext> g_cuda_device_context =
+    make_unique<platform::CUDADeviceContext>(platform::GPUPlace(0));
+#endif
+
 static Executor* NewLocalExecutor(const platform::Place& place,
                                   const ProgramDesc& pdesc, bool is_linear) {
   platform::DeviceContext* device_context = nullptr;
   if (platform::is_cpu_place(place)) {
-    device_context =
-        new platform::CPUDeviceContext(boost::get<platform::CPUPlace>(place));
+    device_context = g_cpu_device_context.get();
   }
 #ifndef PADDLE_ONLY_CPU
   else if {
-    device_context =
-        new platform::CUDADeviceContext(boost::get<platform::GPUPlace>(place));
+    device_context = g_cuda_device_context.get();
   }
 #endif
   return new ExecutorImpl(device_context, &pdesc, is_linear);

From e42cafb24f3868713958213777d798cd54140b40 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Fri, 29 Sep 2017 22:50:40 -0700
Subject: [PATCH 03/61] add executor unittest

---
 paddle/framework/CMakeLists.txt   |  1 +
 paddle/framework/executor.cc      |  6 +++---
 paddle/framework/executor.h       |  2 +-
 paddle/framework/executor_test.cc | 10 +++++++++-
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 3ee721ac93..2cad2e54fa 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -45,3 +45,4 @@ cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
 cc_library(executor SRCS executor.cc DEPS device_context framework_proto)
+cc_test(executor_test SRCS executor_test.cc DEPS executor)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 8534e70f48..7fda2332b8 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -40,7 +40,7 @@ class GraphView : public ProgramDescView {
   void Initialize(const ProgramDesc*) override;
 };
 
-static ProgramDescView* Create(bool is_linear) {
+ProgramDescView* ProgramDescView::Create(bool is_linear) {
   if (is_linear) {
     return new LinearListView();
   } else {
@@ -91,8 +91,8 @@ static std::unique_ptr<platform::CUDADeviceContext> g_cuda_device_context =
     make_unique<platform::CUDADeviceContext>(platform::GPUPlace(0));
 #endif
 
-static Executor* NewLocalExecutor(const platform::Place& place,
-                                  const ProgramDesc& pdesc, bool is_linear) {
+Executor* NewLocalExecutor(const platform::Place& place,
+                           const ProgramDesc& pdesc, bool is_linear) {
   platform::DeviceContext* device_context = nullptr;
   if (platform::is_cpu_place(place)) {
     device_context = g_cpu_device_context.get();
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index 69f0e3f18f..25ef2d4d48 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -26,7 +26,7 @@ class Executor {
   virtual void Run() = 0;
 };
 
-static Executor* NewLocalExecutor(const platform::Place&, const ProgramDesc&);
+Executor* NewLocalExecutor(const platform::Place&, const ProgramDesc&, bool);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index f8a41b12ad..c046ae3158 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -15,4 +15,12 @@ limitations under the License. */
 #include "paddle/framework/executor.h"
 #include "gtest/gtest.h"
 
-TEST(Executor, Init) {}
\ No newline at end of file
+using namespace paddle::platform;
+using namespace paddle::framework;
+
+TEST(Executor, Init) {
+  ProgramDesc pdesc;
+  CPUPlace cpu_place;
+  Executor* executor = NewLocalExecutor(cpu_place, pdesc, true);
+  executor->Run();
+}
\ No newline at end of file

From d4be9730fced2a8effaf06412fa48e2aa0a8c325 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Fri, 29 Sep 2017 23:44:52 -0700
Subject: [PATCH 04/61] fix gpu build error

---
 paddle/framework/executor.cc      | 26 +++++++++++++++++---------
 paddle/framework/executor_test.cc |  1 +
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 7fda2332b8..b38d6be16f 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -80,26 +80,34 @@ class ExecutorImpl : public Executor {
 
 template <typename T, typename... Args>
 std::unique_ptr<T> make_unique(Args&&... args) {
-  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+      return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
 }
 
-static std::unique_ptr<platform::CPUDeviceContext> g_cpu_device_context =
-    make_unique<platform::CPUDeviceContext>(platform::CPUPlace());
+platform::CPUDeviceContext* GetCPUDeviceContext() {
+  static std::unique_ptr<platform::CPUDeviceContext> g_cpu_device_context =
+        make_unique<platform::CPUDeviceContext>(platform::CPUPlace());
+  return g_cpu_device_context.get();
+}
 
 #ifndef PADDLE_ONLY_CPU
-static std::unique_ptr<platform::CUDADeviceContext> g_cuda_device_context =
-    make_unique<platform::CUDADeviceContext>(platform::GPUPlace(0));
+platform::CUDADeviceContext* GetCUDADeviceContext() {
+  static std::unique_ptr<platform::CUDADeviceContext> g_cuda_device_context =
+          make_unique<platform::CUDADeviceContext>(platform::GPUPlace(0));
+  return g_cuda_device_context.get();
+}
 #endif
 
 Executor* NewLocalExecutor(const platform::Place& place,
                            const ProgramDesc& pdesc, bool is_linear) {
   platform::DeviceContext* device_context = nullptr;
   if (platform::is_cpu_place(place)) {
-    device_context = g_cpu_device_context.get();
-  }
+    device_context = GetCPUDeviceContext();
+  } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_ONLY_CPU
-  else if {
-    device_context = g_cuda_device_context.get();
+    device_context = GetCUDADeviceContext();
+  }
+#else
+    PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
   }
 #endif
   return new ExecutorImpl(device_context, &pdesc, is_linear);
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index c046ae3158..6f8ca38768 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -23,4 +23,5 @@ TEST(Executor, Init) {
   CPUPlace cpu_place;
   Executor* executor = NewLocalExecutor(cpu_place, pdesc, true);
   executor->Run();
+  delete executor;
 }
\ No newline at end of file

From b630d4019a0bad74d694633930180912ec19a67c Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Sat, 30 Sep 2017 15:52:05 -0700
Subject: [PATCH 05/61] add scope

---
 paddle/framework/CMakeLists.txt |  2 +-
 paddle/framework/executor.cc    | 24 +++++++++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 2cad2e54fa..df79bc0e8f 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -44,5 +44,5 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
-cc_library(executor SRCS executor.cc DEPS device_context framework_proto)
+cc_library(executor SRCS executor.cc DEPS device_context scope framework_proto)
 cc_test(executor_test SRCS executor_test.cc DEPS executor)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index b38d6be16f..52963d20f0 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/framework/executor.h"
 #include <memory>
+#include "paddle/framework/scope.h"
 #include "paddle/platform/device_context.h"
 
 namespace paddle {
@@ -58,9 +59,10 @@ void GraphView::Initialize(const ProgramDesc*) {
 
 class ExecutorImpl : public Executor {
  public:
-  ExecutorImpl(const platform::DeviceContext* ctx, const ProgramDesc* pdesc,
-               bool is_linear)
-      : device_context_(ctx),
+  ExecutorImpl(Scope* scope, const platform::DeviceContext* ctx,
+               const ProgramDesc* pdesc, bool is_linear)
+      : scope_(scope),
+        device_context_(ctx),
         program_desc_(pdesc),
         view_(ProgramDescView::Create(is_linear)) {}
 
@@ -73,6 +75,7 @@ class ExecutorImpl : public Executor {
   void Initialize();
 
  private:
+  Scope* scope_;
   const platform::DeviceContext* device_context_;
   const ProgramDesc* program_desc_;
   ProgramDescView* view_;
@@ -80,23 +83,29 @@ class ExecutorImpl : public Executor {
 
 template <typename T, typename... Args>
 std::unique_ptr<T> make_unique(Args&&... args) {
-      return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
 }
 
 platform::CPUDeviceContext* GetCPUDeviceContext() {
   static std::unique_ptr<platform::CPUDeviceContext> g_cpu_device_context =
-        make_unique<platform::CPUDeviceContext>(platform::CPUPlace());
+      make_unique<platform::CPUDeviceContext>(platform::CPUPlace());
   return g_cpu_device_context.get();
 }
 
 #ifndef PADDLE_ONLY_CPU
 platform::CUDADeviceContext* GetCUDADeviceContext() {
   static std::unique_ptr<platform::CUDADeviceContext> g_cuda_device_context =
-          make_unique<platform::CUDADeviceContext>(platform::GPUPlace(0));
+      make_unique<platform::CUDADeviceContext>(platform::GPUPlace(0));
   return g_cuda_device_context.get();
 }
 #endif
 
+framework::Scope* GetScope() {
+  static std::unique_ptr<framework::Scope> g_scope =
+      make_unique<framework::Scope>();
+  return g_scope.get();
+}
+
 Executor* NewLocalExecutor(const platform::Place& place,
                            const ProgramDesc& pdesc, bool is_linear) {
   platform::DeviceContext* device_context = nullptr;
@@ -110,11 +119,12 @@ Executor* NewLocalExecutor(const platform::Place& place,
     PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
   }
 #endif
-  return new ExecutorImpl(device_context, &pdesc, is_linear);
+  return new ExecutorImpl(GetScope(), device_context, &pdesc, is_linear);
 }
 
 void ExecutorImpl::Run() {
   // operators running
+  scope_->NewVar();
   device_context_->Wait();
 }
 

From 09500917eee2f3f991b1f92acbb4738d3ea5dba2 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Sat, 30 Sep 2017 16:44:55 -0700
Subject: [PATCH 06/61] pass place to GetCUDADeviceContext

---
 paddle/framework/executor.cc | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 52963d20f0..74153f2449 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -86,16 +86,16 @@ std::unique_ptr<T> make_unique(Args&&... args) {
   return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
 }
 
-platform::CPUDeviceContext* GetCPUDeviceContext() {
+platform::CPUDeviceContext* GetCPUDeviceContext(platform::CPUPlace& place) {
   static std::unique_ptr<platform::CPUDeviceContext> g_cpu_device_context =
-      make_unique<platform::CPUDeviceContext>(platform::CPUPlace());
+      make_unique<platform::CPUDeviceContext>(place);
   return g_cpu_device_context.get();
 }
 
 #ifndef PADDLE_ONLY_CPU
-platform::CUDADeviceContext* GetCUDADeviceContext() {
+platform::CUDADeviceContext* GetCUDADeviceContext(platform::GPUPlace& place) {
   static std::unique_ptr<platform::CUDADeviceContext> g_cuda_device_context =
-      make_unique<platform::CUDADeviceContext>(platform::GPUPlace(0));
+      make_unique<platform::CUDADeviceContext>(place);
   return g_cuda_device_context.get();
 }
 #endif
@@ -110,10 +110,12 @@ Executor* NewLocalExecutor(const platform::Place& place,
                            const ProgramDesc& pdesc, bool is_linear) {
   platform::DeviceContext* device_context = nullptr;
   if (platform::is_cpu_place(place)) {
-    device_context = GetCPUDeviceContext();
+    auto cpu_place = boost::get<platform::CPUPlace>(place);
+    device_context = GetCPUDeviceContext(cpu_place);
   } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_ONLY_CPU
-    device_context = GetCUDADeviceContext();
+    auto gpu_place = boost::get<platform::GPUPlace>(place);
+    device_context = GetCUDADeviceContext(gpu_place);
   }
 #else
     PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");

From ce4d14b4ed5384dc5fb9eb4e2c6d7f1c6b9bc6dd Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Sun, 1 Oct 2017 15:08:20 -0700
Subject: [PATCH 07/61] add struct Device

---
 paddle/framework/CMakeLists.txt |  2 +-
 paddle/framework/executor.cc    | 73 ++++++++++++++++++++++-----------
 2 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 1168fc38af..129a0eb707 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -44,5 +44,5 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
-cc_library(executor SRCS executor.cc DEPS device_context scope framework_proto)
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto)
 cc_test(executor_test SRCS executor_test.cc DEPS executor)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 74153f2449..559cbe125f 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/framework/executor.h"
 #include <memory>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
 #include "paddle/platform/device_context.h"
 
@@ -34,6 +36,9 @@ class ProgramDescView {
 class LinearListView : public ProgramDescView {
  public:
   void Initialize(const ProgramDesc*) override;
+
+ private:
+  std::vector<std::unique_ptr<OperatorBase>> ops_;
 };
 
 class GraphView : public ProgramDescView {
@@ -49,20 +54,36 @@ ProgramDescView* ProgramDescView::Create(bool is_linear) {
   }
 }
 
-void LinearListView::Initialize(const ProgramDesc*) {
+void LinearListView::Initialize(const ProgramDesc* pdesc) {
   // get a LinearView of ProgramDesc
+  for (auto& block_desc : pdesc->blocks()) {
+    for (auto& op_desc : block_desc.ops()) {
+      ops_.emplace_back(OpRegistry::CreateOp(op_desc));
+    }
+  }
 }
 
-void GraphView::Initialize(const ProgramDesc*) {
+void GraphView::Initialize(const ProgramDesc* pdesc) {
   // get a GraphView of ProgramDesc
 }
 
+struct Device {
+  platform::CPUDeviceContext* cpu_device_context;
+#ifndef PADDLE_ONLY_CPU
+  Device(platform::CPUDeviceContext* cpu, platform::CUDADeviceContext* gpu)
+      : cpu_device_context(cpu), cuda_device_context(gpu) {}
+  platform::CDUADeviceContext* cuda_device_context;
+#else
+  explicit Device(platform::CPUDeviceContext* cpu) : cpu_device_context(cpu) {}
+#endif
+};
+
 class ExecutorImpl : public Executor {
  public:
-  ExecutorImpl(Scope* scope, const platform::DeviceContext* ctx,
-               const ProgramDesc* pdesc, bool is_linear)
+  ExecutorImpl(Scope* scope, const Device* device, const ProgramDesc* pdesc,
+               bool is_linear)
       : scope_(scope),
-        device_context_(ctx),
+        device_(device),
         program_desc_(pdesc),
         view_(ProgramDescView::Create(is_linear)) {}
 
@@ -76,7 +97,7 @@ class ExecutorImpl : public Executor {
 
  private:
   Scope* scope_;
-  const platform::DeviceContext* device_context_;
+  const Device* device_;
   const ProgramDesc* program_desc_;
   ProgramDescView* view_;
 };
@@ -86,20 +107,36 @@ std::unique_ptr<T> make_unique(Args&&... args) {
   return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
 }
 
-platform::CPUDeviceContext* GetCPUDeviceContext(platform::CPUPlace& place) {
+platform::CPUDeviceContext* GetCPUDeviceContext(
+    const platform::CPUPlace& place) {
   static std::unique_ptr<platform::CPUDeviceContext> g_cpu_device_context =
       make_unique<platform::CPUDeviceContext>(place);
   return g_cpu_device_context.get();
 }
 
 #ifndef PADDLE_ONLY_CPU
-platform::CUDADeviceContext* GetCUDADeviceContext(platform::GPUPlace& place) {
+platform::CUDADeviceContext* GetCUDADeviceContext(
+    const platform::GPUPlace& place) {
   static std::unique_ptr<platform::CUDADeviceContext> g_cuda_device_context =
       make_unique<platform::CUDADeviceContext>(place);
   return g_cuda_device_context.get();
 }
 #endif
 
+Device* GetDevice(const platform::Place& place) {
+  platform::CPUPlace cpu_place;
+#ifndef PADDLE_ONLY_CPU
+  platform::GPUPlace gpu_place = boost::get<platform::GPUPlace>(place);
+  static std::unique_ptr<Device> g_device = make_unique<Device>(
+      GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place));
+  return g_device.get();
+#else
+  static std::unique_ptr<Device> g_device =
+      make_unique<Device>(GetCPUDeviceContext(cpu_place));
+  return g_device.get();
+#endif
+}
+
 framework::Scope* GetScope() {
   static std::unique_ptr<framework::Scope> g_scope =
       make_unique<framework::Scope>();
@@ -108,26 +145,16 @@ framework::Scope* GetScope() {
 
 Executor* NewLocalExecutor(const platform::Place& place,
                            const ProgramDesc& pdesc, bool is_linear) {
-  platform::DeviceContext* device_context = nullptr;
-  if (platform::is_cpu_place(place)) {
-    auto cpu_place = boost::get<platform::CPUPlace>(place);
-    device_context = GetCPUDeviceContext(cpu_place);
-  } else if (platform::is_gpu_place(place)) {
-#ifndef PADDLE_ONLY_CPU
-    auto gpu_place = boost::get<platform::GPUPlace>(place);
-    device_context = GetCUDADeviceContext(gpu_place);
-  }
-#else
-    PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
-  }
-#endif
-  return new ExecutorImpl(GetScope(), device_context, &pdesc, is_linear);
+  return new ExecutorImpl(GetScope(), GetDevice(place), &pdesc, is_linear);
 }
 
 void ExecutorImpl::Run() {
   // operators running
   scope_->NewVar();
-  device_context_->Wait();
+  device_->cpu_device_context->Wait();
+#ifndef PADDLE_ONLY_CPU
+  device_->cuda_device_context->Wait();
+#endif
 }
 
 void ExecutorImpl::Initialize() {

From f29a6b020f633e7c69ae487b7372146c28046597 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Sun, 1 Oct 2017 15:24:18 -0700
Subject: [PATCH 08/61] fix gpu build error

---
 paddle/framework/executor.cc | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 559cbe125f..ebe3259bc0 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -69,10 +69,13 @@ void GraphView::Initialize(const ProgramDesc* pdesc) {
 
 struct Device {
   platform::CPUDeviceContext* cpu_device_context;
+#ifndef PADDLE_ONLY_CPU
+  platform::CUDADeviceContext* cuda_device_context;
+#endif
+
 #ifndef PADDLE_ONLY_CPU
   Device(platform::CPUDeviceContext* cpu, platform::CUDADeviceContext* gpu)
       : cpu_device_context(cpu), cuda_device_context(gpu) {}
-  platform::CDUADeviceContext* cuda_device_context;
 #else
   explicit Device(platform::CPUDeviceContext* cpu) : cpu_device_context(cpu) {}
 #endif
@@ -126,10 +129,16 @@ platform::CUDADeviceContext* GetCUDADeviceContext(
 Device* GetDevice(const platform::Place& place) {
   platform::CPUPlace cpu_place;
 #ifndef PADDLE_ONLY_CPU
-  platform::GPUPlace gpu_place = boost::get<platform::GPUPlace>(place);
-  static std::unique_ptr<Device> g_device = make_unique<Device>(
-      GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place));
-  return g_device.get();
+  if (platform::is_gpu_place(place)) {
+    platform::GPUPlace gpu_place = boost::get<platform::GPUPlace>(place);
+    static std::unique_ptr<Device> g_device = make_unique<Device>(
+        GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place));
+    return g_device.get();
+  } else {
+    static std::unique_ptr<Device> g_device =
+        make_unique<Device>(GetCPUDeviceContext(cpu_place), nullptr);
+    return g_device.get();
+  }
 #else
   static std::unique_ptr<Device> g_device =
       make_unique<Device>(GetCPUDeviceContext(cpu_place));
@@ -153,7 +162,9 @@ void ExecutorImpl::Run() {
   scope_->NewVar();
   device_->cpu_device_context->Wait();
 #ifndef PADDLE_ONLY_CPU
-  device_->cuda_device_context->Wait();
+  if (device_->cuda_device_context) {
+    device_->cuda_device_context->Wait();
+  }
 #endif
 }
 

From b5dbe88b5ab504f88c6e7eaaa8b27d3965701478 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Mon, 2 Oct 2017 20:26:17 -0700
Subject: [PATCH 09/61] follow comments

---
 paddle/framework/CMakeLists.txt   |   2 +-
 paddle/framework/executor.cc      | 159 +++---------------------------
 paddle/framework/executor.h       |  14 ++-
 paddle/framework/executor_test.cc |  12 ++-
 paddle/platform/CMakeLists.txt    |   2 +
 paddle/platform/device.cc         |  59 +++++++++++
 paddle/platform/device.h          |  45 +++++++++
 7 files changed, 139 insertions(+), 154 deletions(-)
 create mode 100644 paddle/platform/device.cc
 create mode 100644 paddle/platform/device.h

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 984fc62aa3..506d0f9833 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -44,5 +44,5 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto)
+cc_library(executor SRCS executor.cc DEPS op_registry device scope framework_proto)
 cc_test(executor_test SRCS executor_test.cc DEPS executor)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index ebe3259bc0..57e177bb0a 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -15,162 +15,31 @@ limitations under the License. */
 #include "paddle/framework/executor.h"
 #include <memory>
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
-#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
 
-class LinearListView;
-class GraphView;
-
-// Immutable view of a ProgramDesc organized for efficient execution.
-class ProgramDescView {
- public:
-  virtual ~ProgramDescView() {}
-  virtual void Initialize(const ProgramDesc*) = 0;
-  static ProgramDescView* Create(bool is_linear);
-};
-
-class LinearListView : public ProgramDescView {
- public:
-  void Initialize(const ProgramDesc*) override;
-
- private:
-  std::vector<std::unique_ptr<OperatorBase>> ops_;
-};
-
-class GraphView : public ProgramDescView {
- public:
-  void Initialize(const ProgramDesc*) override;
-};
-
-ProgramDescView* ProgramDescView::Create(bool is_linear) {
-  if (is_linear) {
-    return new LinearListView();
-  } else {
-    return new GraphView();
-  }
-}
-
-void LinearListView::Initialize(const ProgramDesc* pdesc) {
-  // get a LinearView of ProgramDesc
-  for (auto& block_desc : pdesc->blocks()) {
-    for (auto& op_desc : block_desc.ops()) {
-      ops_.emplace_back(OpRegistry::CreateOp(op_desc));
-    }
+Executor::Executor(const std::vector<platform::Place>& places) {
+  devices_.resize(places.size());
+  for (size_t i = 0; i < places.size(); i++) {
+    devices_[i] = platform::GetDevice(places[i]);
   }
 }
 
-void GraphView::Initialize(const ProgramDesc* pdesc) {
-  // get a GraphView of ProgramDesc
-}
-
-struct Device {
-  platform::CPUDeviceContext* cpu_device_context;
-#ifndef PADDLE_ONLY_CPU
-  platform::CUDADeviceContext* cuda_device_context;
-#endif
-
-#ifndef PADDLE_ONLY_CPU
-  Device(platform::CPUDeviceContext* cpu, platform::CUDADeviceContext* gpu)
-      : cpu_device_context(cpu), cuda_device_context(gpu) {}
-#else
-  explicit Device(platform::CPUDeviceContext* cpu) : cpu_device_context(cpu) {}
-#endif
-};
-
-class ExecutorImpl : public Executor {
- public:
-  ExecutorImpl(Scope* scope, const Device* device, const ProgramDesc* pdesc,
-               bool is_linear)
-      : scope_(scope),
-        device_(device),
-        program_desc_(pdesc),
-        view_(ProgramDescView::Create(is_linear)) {}
-
-  virtual ~ExecutorImpl() {
-    if (view_) delete view_;
-  }
-
-  void Run() override;
-
-  void Initialize();
-
- private:
-  Scope* scope_;
-  const Device* device_;
-  const ProgramDesc* program_desc_;
-  ProgramDescView* view_;
-};
-
-template <typename T, typename... Args>
-std::unique_ptr<T> make_unique(Args&&... args) {
-  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
-
-platform::CPUDeviceContext* GetCPUDeviceContext(
-    const platform::CPUPlace& place) {
-  static std::unique_ptr<platform::CPUDeviceContext> g_cpu_device_context =
-      make_unique<platform::CPUDeviceContext>(place);
-  return g_cpu_device_context.get();
-}
-
-#ifndef PADDLE_ONLY_CPU
-platform::CUDADeviceContext* GetCUDADeviceContext(
-    const platform::GPUPlace& place) {
-  static std::unique_ptr<platform::CUDADeviceContext> g_cuda_device_context =
-      make_unique<platform::CUDADeviceContext>(place);
-  return g_cuda_device_context.get();
-}
-#endif
-
-Device* GetDevice(const platform::Place& place) {
-  platform::CPUPlace cpu_place;
-#ifndef PADDLE_ONLY_CPU
-  if (platform::is_gpu_place(place)) {
-    platform::GPUPlace gpu_place = boost::get<platform::GPUPlace>(place);
-    static std::unique_ptr<Device> g_device = make_unique<Device>(
-        GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place));
-    return g_device.get();
-  } else {
-    static std::unique_ptr<Device> g_device =
-        make_unique<Device>(GetCPUDeviceContext(cpu_place), nullptr);
-    return g_device.get();
-  }
-#else
-  static std::unique_ptr<Device> g_device =
-      make_unique<Device>(GetCPUDeviceContext(cpu_place));
-  return g_device.get();
-#endif
-}
-
-framework::Scope* GetScope() {
-  static std::unique_ptr<framework::Scope> g_scope =
-      make_unique<framework::Scope>();
-  return g_scope.get();
-}
-
-Executor* NewLocalExecutor(const platform::Place& place,
-                           const ProgramDesc& pdesc, bool is_linear) {
-  return new ExecutorImpl(GetScope(), GetDevice(place), &pdesc, is_linear);
-}
-
-void ExecutorImpl::Run() {
+void Executor::Run(const ProgramDesc& pdesc, Scope* scope,
+                   std::vector<Tensor>* outputs) {
   // operators running
-  scope_->NewVar();
-  device_->cpu_device_context->Wait();
+  Scope& local_scope = scope->NewScope();
+  local_scope.NewVar();
+  for (auto device : devices_) {
+    device->cpu_device_context->Wait();
 #ifndef PADDLE_ONLY_CPU
-  if (device_->cuda_device_context) {
-    device_->cuda_device_context->Wait();
-  }
+    if (device->cuda_device_context) {
+      device->cuda_device_context->Wait();
+    }
 #endif
-}
-
-void ExecutorImpl::Initialize() {
-  // Initialize the ProgramDescView
-  view_->Initialize(program_desc_);
+  }
 }
 
 }  // namespace framework
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index 25ef2d4d48..5d6d7f37a6 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -15,18 +15,22 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/framework/framework.pb.h"
-#include "paddle/platform/place.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device.h"
 
 namespace paddle {
 namespace framework {
 
 class Executor {
  public:
-  virtual ~Executor() {}
-  virtual void Run() = 0;
-};
+  explicit Executor(const std::vector<platform::Place>& places);
+  ~Executor() {}
+  void Run(const ProgramDesc&, Scope*, std::vector<Tensor>*);
 
-Executor* NewLocalExecutor(const platform::Place&, const ProgramDesc&, bool);
+ private:
+  std::vector<platform::Device*> devices_;
+};
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 6f8ca38768..51d2dfc1c3 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -19,9 +19,15 @@ using namespace paddle::platform;
 using namespace paddle::framework;
 
 TEST(Executor, Init) {
+  CPUPlace cpu_place1, cpu_place2;
+  std::vector<Place> places;
+  places.push_back(cpu_place1);
+  places.push_back(cpu_place2);
+  Executor* executor = new Executor(places);
+
   ProgramDesc pdesc;
-  CPUPlace cpu_place;
-  Executor* executor = NewLocalExecutor(cpu_place, pdesc, true);
-  executor->Run();
+  Scope s;
+  std::vector<Tensor>* outputs{nullptr};
+  executor->Run(pdesc, &s, outputs);
   delete executor;
 }
\ No newline at end of file
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index daf519b91d..b581937393 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -23,5 +23,7 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
     system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
 
+cc_library(device SRCS device.cc DEPS device_context)
+
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
diff --git a/paddle/platform/device.cc b/paddle/platform/device.cc
new file mode 100644
index 0000000000..7acd87c8c3
--- /dev/null
+++ b/paddle/platform/device.cc
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/device.h"
+
+namespace paddle {
+namespace platform {
+
+template <typename T, typename... Args>
+std::unique_ptr<T> make_unique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+CPUDeviceContext* GetCPUDeviceContext(const CPUPlace& place) {
+  static std::unique_ptr<CPUDeviceContext> g_cpu_device_context =
+      make_unique<CPUDeviceContext>(place);
+  return g_cpu_device_context.get();
+}
+
+#ifndef PADDLE_ONLY_CPU
+CUDADeviceContext* GetCUDADeviceContext(const GPUPlace& place) {
+  static std::unique_ptr<CUDADeviceContext> g_cuda_device_context =
+      make_unique<CUDADeviceContext>(place);
+  return g_cuda_device_context.get();
+}
+#endif
+
+Device* GetDevice(const Place& place) {
+  CPUPlace cpu_place;
+#ifndef PADDLE_ONLY_CPU
+  if (is_gpu_place(place)) {
+    GPUPlace gpu_place = boost::get<GPUPlace>(place);
+    static std::unique_ptr<Device> g_device = make_unique<Device>(
+        GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place));
+    return g_device.get();
+  } else {
+    static std::unique_ptr<Device> g_device =
+        make_unique<Device>(GetCPUDeviceContext(cpu_place), nullptr);
+    return g_device.get();
+  }
+#else
+  static std::unique_ptr<Device> g_device =
+      make_unique<Device>(GetCPUDeviceContext(cpu_place));
+  return g_device.get();
+#endif
+}
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/device.h b/paddle/platform/device.h
new file mode 100644
index 0000000000..b1bb8073cf
--- /dev/null
+++ b/paddle/platform/device.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace platform {
+
+struct Device {
+  CPUDeviceContext* cpu_device_context;
+#ifndef PADDLE_ONLY_CPU
+  CUDADeviceContext* cuda_device_context;
+#endif
+
+#ifndef PADDLE_ONLY_CPU
+  Device(CPUDeviceContext* cpu, CUDADeviceContext* gpu)
+      : cpu_device_context(cpu), cuda_device_context(gpu) {}
+#else
+  explicit Device(CPUDeviceContext* cpu) : cpu_device_context(cpu) {}
+#endif
+};
+
+CPUDeviceContext* GetCPUDeviceContext(const platform::CPUPlace& place);
+
+#ifndef PADDLE_ONLY_CPU
+CUDADeviceContext* GetCUDADeviceContext(const platform::GPUPlace& place);
+#endif
+
+Device* GetDevice(const platform::Place& place);
+}  // namespace platform
+}  // namespace paddle

From 6e2f96841a5d3e64dc1c4eabb85b7984099b1d0e Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 3 Oct 2017 17:36:29 +0000
Subject: [PATCH 10/61] simple test

---
 paddle/framework/executor.cc      | 30 ++++++++++++++++++------
 paddle/framework/executor_test.cc | 39 ++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index ebe3259bc0..9e7f6f88df 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/framework/executor.h"
 #include <memory>
+#include <vector>
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
@@ -22,6 +23,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+// using std::unique_ptr<OperatorBase> op_ptr;
+
 class LinearListView;
 class GraphView;
 
@@ -158,14 +161,27 @@ Executor* NewLocalExecutor(const platform::Place& place,
 }
 
 void ExecutorImpl::Run() {
-  // operators running
-  scope_->NewVar();
-  device_->cpu_device_context->Wait();
-#ifndef PADDLE_ONLY_CPU
-  if (device_->cuda_device_context) {
-    device_->cuda_device_context->Wait();
+  // TODO(tonyyang-svail): only runs the first block
+  auto& block = program_desc_->blocks(0);
+
+  for (auto& var : block.vars()) {
+    scope_->NewVar(var.name());
   }
-#endif
+
+  // std::vector<op_ptr> ops;
+  for (auto& op_desc : block.ops()) {
+    auto op = framework::OpRegistry::CreateOp(op_desc);
+    op->InferShape(device_->cpu_device_context);
+    op->Compute();
+  }
+
+  // TODO(tonyyang-svail): need to test gpu device
+  //   device_->cpu_device_context->Wait();
+  // #ifndef PADDLE_ONLY_CPU
+  //   if (device_->cuda_device_context) {
+  //     device_->cuda_device_context->Wait();
+  //   }
+  // #endif
 }
 
 void ExecutorImpl::Initialize() {
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 6f8ca38768..9ab1b65803 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/executor.h"
+#include "paddle/framework/attribute.h"
+
 #include "gtest/gtest.h"
 
 using namespace paddle::platform;
@@ -20,8 +22,43 @@ using namespace paddle::framework;
 
 TEST(Executor, Init) {
   ProgramDesc pdesc;
+
+  auto root_block = pdesc.add_blocks();
+  root_block->set_idx(0);
+  root_block->set_parent_idx(-1);
+
+  auto a = root_block->add_vars();
+  a->set_name("a");
+  auto a_lt = a->mutable_lod_tensor();
+  a_lt->set_data_type(paddle::framework::DataType::FP32);
+  a_lt->add_dims(640);
+  a_lt->add_dims(640);
+
+  auto b = root_block->add_vars();
+  b->set_name("b");
+  auto b_lt = b->mutable_lod_tensor();
+  b_lt->set_data_type(paddle::framework::DataType::FP32);
+  b_lt->add_dims(640);
+  b_lt->add_dims(640);
+
+  auto c = root_block->add_vars();
+  c->set_name("c");
+  auto c_lt = c->mutable_lod_tensor();
+  c_lt->set_data_type(paddle::framework::DataType::FP32);
+  c_lt->add_dims(640);
+  c_lt->add_dims(640);
+
+  auto op1 = root_block->add_ops();
+  op1->set_type("elementwise_add");
+  auto X = op1->add_inputs();
+  X->set_parameter("X");
+  X->add_arguments("a");
+  auto Y = op1->add_inputs();
+  Y->set_parameter("Y");
+  Y->add_arguments("b");
+
   CPUPlace cpu_place;
   Executor* executor = NewLocalExecutor(cpu_place, pdesc, true);
   executor->Run();
   delete executor;
-}
\ No newline at end of file
+}

From e946fc15192e7a05df42aeea0b4bf1b87fb77472 Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 3 Oct 2017 19:42:18 +0000
Subject: [PATCH 11/61] add elementwise_add

---
 paddle/framework/CMakeLists.txt   |  2 +-
 paddle/framework/executor.cc      | 25 +++++++++++++++++++++++++
 paddle/framework/executor.h       |  1 +
 paddle/framework/executor_test.cc |  8 +++++++-
 4 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index cbd39dd095..58e78e9a6a 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -44,7 +44,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device scope framework_proto)
+cc_library(executor SRCS executor.cc DEPS op_registry device scope framework_proto ${GLOB_OP_LIB})
 cc_test(executor_test SRCS executor_test.cc DEPS executor)
 
 cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index a61f0f7162..94b9b3b350 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -31,6 +31,31 @@ Executor::Executor(const std::vector<platform::Place>& places) {
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope,
                    std::vector<Tensor>* outputs) {
   // operators running
+  // TODO(tonyyang-svail):
+  //    - only runs the first block
+  //    - only runs on the first device
+  auto& block = pdesc.blocks(0);
+  auto& device = devices_[0];
+
+  for (auto& var : block.vars()) {
+    scope->NewVar(var.name());
+  }
+
+  // std::vector<op_ptr> ops;
+  for (auto& op_desc : block.ops()) {
+    auto op = framework::OpRegistry::CreateOp(op_desc);
+    // op->InferShape(*scope);
+    op->Run(*scope, *device->cpu_device_context);
+  }
+
+  // TODO(tonyyang-svail): need to test gpu device
+  //   device_->cpu_device_context->Wait();
+  // #ifndef PADDLE_ONLY_CPU
+  //   if (device_->cuda_device_context) {
+  //     device_->cuda_device_context->Wait();
+  //   }
+  // #endif
+
   Scope& local_scope = scope->NewScope();
   local_scope.NewVar();
   for (auto device : devices_) {
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index 5d6d7f37a6..cdb80bc104 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/op_info.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device.h"
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 4560d6c503..11255af808 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -13,9 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/executor.h"
+#include "gtest/gtest.h"
 #include "paddle/framework/attribute.h"
 
-#include "gtest/gtest.h"
+#include <gtest/gtest.h>
+#include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+USE_OP(elementwise_add);
 
 using namespace paddle::platform;
 using namespace paddle::framework;

From 6c4d1f551d96dda505be54c9a705d5a6784dd062 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Tue, 3 Oct 2017 13:43:25 -0700
Subject: [PATCH 12/61] refine codes

---
 paddle/framework/CMakeLists.txt               |   8 +-
 paddle/framework/executor.cc                  |  44 ++++----
 paddle/framework/executor.h                   |   4 +-
 paddle/framework/executor_test.cc             | 103 ++++++++++--------
 paddle/platform/CMakeLists.txt                |   2 +-
 paddle/platform/device.cc                     |  59 ----------
 paddle/platform/device_context_manager.cc     |  68 ++++++++++++
 .../{device.h => device_context_manager.h}    |  45 +++++---
 8 files changed, 188 insertions(+), 145 deletions(-)
 delete mode 100644 paddle/platform/device.cc
 create mode 100644 paddle/platform/device_context_manager.cc
 rename paddle/platform/{device.h => device_context_manager.h} (52%)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 58e78e9a6a..898b3a990d 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -44,8 +44,12 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device scope framework_proto ${GLOB_OP_LIB})
-cc_test(executor_test SRCS executor_test.cc DEPS executor)
+cc_library(executor SRCS executor.cc DEPS op_registry device_context_manager scope framework_proto ${GLOB_OP_LIB})
+if(WITH_GPU)
+    nv_test(executor_test SRCS executor_test.cc DEPS executor)
+else()
+    cc_test(executor_test SRCS executor_test.cc DEPS executor)
+endif()
 
 cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
 cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 94b9b3b350..717f9bf81a 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -22,9 +22,21 @@ namespace paddle {
 namespace framework {
 
 Executor::Executor(const std::vector<platform::Place>& places) {
-  devices_.resize(places.size());
+  device_contexts_.resize(places.size());
   for (size_t i = 0; i < places.size(); i++) {
-    devices_[i] = platform::GetDevice(places[i]);
+    if (platform::is_cpu_place(places[i])) {
+      device_contexts_[i] = platform::DeviceContextManager::Get()
+                                ->GetDeviceContext<platform::CPUPlace>(
+                                    boost::get<platform::CPUPlace>(places[i]));
+    } else {
+#ifndef PADDLE_ONLY_CPU
+      device_contexts_[i] = platform::DeviceContextManager::Get()
+                                ->GetDeviceContext<platform::GPUPlace>(
+                                    boost::get<platform::GPUPlace>(places[i]));
+#else
+      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+#endif
+    }
   }
 }
 
@@ -34,37 +46,25 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope,
   // TODO(tonyyang-svail):
   //    - only runs the first block
   //    - only runs on the first device
+  Scope& local_scope = scope->NewScope();
+
   auto& block = pdesc.blocks(0);
-  auto& device = devices_[0];
+  auto& device_context = device_contexts_[0];
 
   for (auto& var : block.vars()) {
-    scope->NewVar(var.name());
+    local_scope.NewVar(var.name());
   }
 
   // std::vector<op_ptr> ops;
   for (auto& op_desc : block.ops()) {
     auto op = framework::OpRegistry::CreateOp(op_desc);
-    // op->InferShape(*scope);
-    op->Run(*scope, *device->cpu_device_context);
+    // InferShape is now doing inside Run method.
+    op->Run(local_scope, *device_context);
   }
 
   // TODO(tonyyang-svail): need to test gpu device
-  //   device_->cpu_device_context->Wait();
-  // #ifndef PADDLE_ONLY_CPU
-  //   if (device_->cuda_device_context) {
-  //     device_->cuda_device_context->Wait();
-  //   }
-  // #endif
-
-  Scope& local_scope = scope->NewScope();
-  local_scope.NewVar();
-  for (auto device : devices_) {
-    device->cpu_device_context->Wait();
-#ifndef PADDLE_ONLY_CPU
-    if (device->cuda_device_context) {
-      device->cuda_device_context->Wait();
-    }
-#endif
+  for (auto device_context : device_contexts_) {
+    device_context->Wait();
   }
 }
 
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index cdb80bc104..795b8ffdab 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor.h"
-#include "paddle/platform/device.h"
+#include "paddle/platform/device_context_manager.h"
 
 namespace paddle {
 namespace framework {
@@ -30,7 +30,7 @@ class Executor {
   void Run(const ProgramDesc&, Scope*, std::vector<Tensor>*);
 
  private:
-  std::vector<platform::Device*> devices_;
+  std::vector<platform::DeviceContext*> device_contexts_;
 };
 
 }  // namespace framework
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 11255af808..810ff2a512 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -15,8 +15,6 @@ limitations under the License. */
 #include "paddle/framework/executor.h"
 #include "gtest/gtest.h"
 #include "paddle/framework/attribute.h"
-
-#include <gtest/gtest.h>
 #include "paddle/framework/grad_op_builder.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
@@ -26,52 +24,71 @@ USE_OP(elementwise_add);
 using namespace paddle::platform;
 using namespace paddle::framework;
 
-TEST(Executor, Init) {
-  ProgramDesc pdesc;
-
-  auto root_block = pdesc.add_blocks();
-  root_block->set_idx(0);
-  root_block->set_parent_idx(-1);
-
-  auto a = root_block->add_vars();
-  a->set_name("a");
-  auto a_lt = a->mutable_lod_tensor();
-  a_lt->set_data_type(paddle::framework::DataType::FP32);
-  a_lt->add_dims(640);
-  a_lt->add_dims(640);
-
-  auto b = root_block->add_vars();
-  b->set_name("b");
-  auto b_lt = b->mutable_lod_tensor();
-  b_lt->set_data_type(paddle::framework::DataType::FP32);
-  b_lt->add_dims(640);
-  b_lt->add_dims(640);
-
-  auto c = root_block->add_vars();
-  c->set_name("c");
-  auto c_lt = c->mutable_lod_tensor();
-  c_lt->set_data_type(paddle::framework::DataType::FP32);
-  c_lt->add_dims(640);
-  c_lt->add_dims(640);
-
-  auto op1 = root_block->add_ops();
-  op1->set_type("elementwise_add");
-  auto X = op1->add_inputs();
-  X->set_parameter("X");
-  X->add_arguments("a");
-  auto Y = op1->add_inputs();
-  Y->set_parameter("Y");
-  Y->add_arguments("b");
-
-  CPUPlace cpu_place1, cpu_place2;
+class ExecutorTester : public ::testing::Test {
+ public:
+  virtual void SetUp() override {
+    auto root_block = pdesc_.add_blocks();
+    root_block->set_idx(0);
+    root_block->set_parent_idx(-1);
+
+    auto a = root_block->add_vars();
+    a->set_name("a");
+    auto a_lt = a->mutable_lod_tensor();
+    a_lt->set_data_type(paddle::framework::DataType::FP32);
+    a_lt->add_dims(640);
+    a_lt->add_dims(640);
+
+    auto b = root_block->add_vars();
+    b->set_name("b");
+    auto b_lt = b->mutable_lod_tensor();
+    b_lt->set_data_type(paddle::framework::DataType::FP32);
+    b_lt->add_dims(640);
+    b_lt->add_dims(640);
+
+    auto c = root_block->add_vars();
+    c->set_name("c");
+    auto c_lt = c->mutable_lod_tensor();
+    c_lt->set_data_type(paddle::framework::DataType::FP32);
+    c_lt->add_dims(640);
+    c_lt->add_dims(640);
+
+    auto op1 = root_block->add_ops();
+    op1->set_type("elementwise_add");
+    auto X = op1->add_inputs();
+    X->set_parameter("X");
+    X->add_arguments("a");
+    auto Y = op1->add_inputs();
+    Y->set_parameter("Y");
+    Y->add_arguments("b");
+  }
+
+ protected:
+  std::vector<Tensor>* outputs_{nullptr};
+  ProgramDesc pdesc_;
+  Scope scope_;
+};
+
+TEST_F(ExecutorTester, InitCPU) {
   std::vector<Place> places;
+  CPUPlace cpu_place1, cpu_place2;
   places.push_back(cpu_place1);
   places.push_back(cpu_place2);
 
   Executor* executor = new Executor(places);
-  Scope s;
-  std::vector<Tensor>* outputs{nullptr};
-  executor->Run(pdesc, &s, outputs);
+  executor->Run(pdesc_, &scope_, outputs_);
+  delete executor;
+}
+
+#ifndef PADDLE_ONLY_CPU
+TEST_F(ExecutorTester, InitGPU) {
+  std::vector<Place> places;
+  GPUPlace gpu_place0(0);
+  GPUPlace gpu_place1(1);
+  places.push_back(gpu_place0);
+  places.push_back(gpu_place1);
 
+  Executor* executor = new Executor(places);
+  executor->Run(pdesc_, &scope_, outputs_);
   delete executor;
 }
+#endif
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index b581937393..b4ddf721dd 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -23,7 +23,7 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
     system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
 
-cc_library(device SRCS device.cc DEPS device_context)
+cc_library(device_context_manager SRCS device_context_manager.cc DEPS device_context)
 
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
diff --git a/paddle/platform/device.cc b/paddle/platform/device.cc
deleted file mode 100644
index 7acd87c8c3..0000000000
--- a/paddle/platform/device.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/platform/device.h"
-
-namespace paddle {
-namespace platform {
-
-template <typename T, typename... Args>
-std::unique_ptr<T> make_unique(Args&&... args) {
-  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
-
-CPUDeviceContext* GetCPUDeviceContext(const CPUPlace& place) {
-  static std::unique_ptr<CPUDeviceContext> g_cpu_device_context =
-      make_unique<CPUDeviceContext>(place);
-  return g_cpu_device_context.get();
-}
-
-#ifndef PADDLE_ONLY_CPU
-CUDADeviceContext* GetCUDADeviceContext(const GPUPlace& place) {
-  static std::unique_ptr<CUDADeviceContext> g_cuda_device_context =
-      make_unique<CUDADeviceContext>(place);
-  return g_cuda_device_context.get();
-}
-#endif
-
-Device* GetDevice(const Place& place) {
-  CPUPlace cpu_place;
-#ifndef PADDLE_ONLY_CPU
-  if (is_gpu_place(place)) {
-    GPUPlace gpu_place = boost::get<GPUPlace>(place);
-    static std::unique_ptr<Device> g_device = make_unique<Device>(
-        GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place));
-    return g_device.get();
-  } else {
-    static std::unique_ptr<Device> g_device =
-        make_unique<Device>(GetCPUDeviceContext(cpu_place), nullptr);
-    return g_device.get();
-  }
-#else
-  static std::unique_ptr<Device> g_device =
-      make_unique<Device>(GetCPUDeviceContext(cpu_place));
-  return g_device.get();
-#endif
-}
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/platform/device_context_manager.cc b/paddle/platform/device_context_manager.cc
new file mode 100644
index 0000000000..156d317c8a
--- /dev/null
+++ b/paddle/platform/device_context_manager.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/device_context_manager.h"
+
+namespace paddle {
+namespace platform {
+
+// Starts with no contexts; they are created lazily by GetDeviceContext.
+DeviceContextManager::DeviceContextManager() : cpu_context_(nullptr) {
+#ifndef PADDLE_ONLY_CPU
+  device_count_ = GetDeviceCount();
+  // resize(), not reserve(): the slots are indexed by GPU id later on, so
+  // the elements must really exist. Each slot starts out as nullptr.
+  cuda_contexts_.resize(device_count_, nullptr);
+#endif
+}
+
+template <>
+CPUDeviceContext* DeviceContextManager::GetDeviceContext<
+    CPUPlace, CPUDeviceContext>(const CPUPlace& place) {
+  if (!cpu_context_) {
+    cpu_context_ = new CPUDeviceContext(place);
+  }
+  return cpu_context_;
+}
+
+#ifndef PADDLE_ONLY_CPU
+template <>
+CUDADeviceContext* DeviceContextManager::GetDeviceContext<
+    GPUPlace, CUDADeviceContext>(const GPUPlace& place) {
+  int gpu_id = place.device;
+  PADDLE_ENFORCE(gpu_id < device_count_,
+                 "GPU device id must less than device count");
+  SetDeviceId(gpu_id);
+  if (!cuda_contexts_[gpu_id]) {
+    cuda_contexts_[gpu_id] = new CUDADeviceContext(place);
+  }
+  return cuda_contexts_[gpu_id];
+}
+#endif
+
+// Releases every context this manager allocated with new.
+DeviceContextManager::~DeviceContextManager() {
+  // delete on a null pointer is a no-op, so no explicit null checks needed.
+  delete cpu_context_;
+#ifndef PADDLE_ONLY_CPU
+  // Walk by index up to device_count_ (one slot per GPU id), not by
+  // iterating the vector.
+  for (int gpu_id = 0; gpu_id < device_count_; ++gpu_id) {
+    delete cuda_contexts_[gpu_id];
+  }
+#endif
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/device.h b/paddle/platform/device_context_manager.h
similarity index 52%
rename from paddle/platform/device.h
rename to paddle/platform/device_context_manager.h
index b1bb8073cf..da15808a60 100644
--- a/paddle/platform/device.h
+++ b/paddle/platform/device_context_manager.h
@@ -13,33 +13,46 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
 #include "paddle/platform/device_context.h"
-#include "paddle/platform/place.h"
 
 namespace paddle {
 namespace platform {
 
-struct Device {
-  CPUDeviceContext* cpu_device_context;
-#ifndef PADDLE_ONLY_CPU
-  CUDADeviceContext* cuda_device_context;
-#endif
+template <typename T>
+struct Converter;
+
+template <>
+struct Converter<CPUPlace> {
+  using DeviceContextType = CPUDeviceContext;
+};
 
 #ifndef PADDLE_ONLY_CPU
-  Device(CPUDeviceContext* cpu, CUDADeviceContext* gpu)
-      : cpu_device_context(cpu), cuda_device_context(gpu) {}
-#else
-  explicit Device(CPUDeviceContext* cpu) : cpu_device_context(cpu) {}
-#endif
+template <>
+struct Converter<GPUPlace> {
+  using DeviceContextType = CUDADeviceContext;
 };
+#endif
+
+class DeviceContextManager {
+ public:
+  DeviceContextManager();
+  ~DeviceContextManager();
+
+  template <typename PlaceType, typename DeviceType = typename Converter<
+                                    PlaceType>::DeviceContextType>
+  DeviceType* GetDeviceContext(const PlaceType& place);
 
-CPUDeviceContext* GetCPUDeviceContext(const platform::CPUPlace& place);
+  static DeviceContextManager* Get() {
+    static DeviceContextManager inst;
+    return &inst;
+  }
 
+ private:
+  CPUDeviceContext* cpu_context_;
 #ifndef PADDLE_ONLY_CPU
-CUDADeviceContext* GetCUDADeviceContext(const platform::GPUPlace& place);
+  int device_count_;
+  std::vector<CUDADeviceContext*> cuda_contexts_;
 #endif
-
-Device* GetDevice(const platform::Place& place);
+};
 }  // namespace platform
 }  // namespace paddle

From f5e73f4c7e526e10ec8efe4afc4487b8f60e743d Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 3 Oct 2017 23:29:03 +0000
Subject: [PATCH 13/61] pass simple elementwise_add op

---
 paddle/framework/executor.cc      | 36 ++++++++----------
 paddle/framework/executor_test.cc | 63 +++++++++++++++++++++----------
 2 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 94b9b3b350..da387b47ba 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/executor.h"
+#include <iostream>
 #include <memory>
 #include <vector>
+#include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
 
@@ -30,41 +32,33 @@ Executor::Executor(const std::vector<platform::Place>& places) {
 
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope,
                    std::vector<Tensor>* outputs) {
-  // operators running
   // TODO(tonyyang-svail):
   //    - only runs the first block
   //    - only runs on the first device
+  //    - test on gpu
   auto& block = pdesc.blocks(0);
   auto& device = devices_[0];
 
+  // TODO(tonyyang-svail):
+  //    - runs on a new local scope
+  // Scope& local_scope = scope->NewScope();
+
   for (auto& var : block.vars()) {
     scope->NewVar(var.name());
   }
 
-  // std::vector<op_ptr> ops;
   for (auto& op_desc : block.ops()) {
-    auto op = framework::OpRegistry::CreateOp(op_desc);
-    // op->InferShape(*scope);
+    auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
     op->Run(*scope, *device->cpu_device_context);
   }
 
-  // TODO(tonyyang-svail): need to test gpu device
-  //   device_->cpu_device_context->Wait();
-  // #ifndef PADDLE_ONLY_CPU
-  //   if (device_->cuda_device_context) {
-  //     device_->cuda_device_context->Wait();
-  //   }
-  // #endif
-
-  Scope& local_scope = scope->NewScope();
-  local_scope.NewVar();
-  for (auto device : devices_) {
-    device->cpu_device_context->Wait();
-#ifndef PADDLE_ONLY_CPU
-    if (device->cuda_device_context) {
-      device->cuda_device_context->Wait();
-    }
-#endif
+  // print tensor value
+  for (auto& var : block.vars()) {
+    std::cout << var.name() << std::endl;
+    auto v = scope->FindVar(var.name());
+    const LoDTensor& t = v->Get<LoDTensor>();
+    for (int i = 0; i < t.numel(); ++i) std::cout << t.data<float>()[i] << " ";
+    std::cout << std::endl;
   }
 }
 
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 11255af808..300de36b87 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -16,16 +16,49 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/framework/attribute.h"
 
-#include <gtest/gtest.h>
 #include "paddle/framework/grad_op_builder.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 
+#include <vector>
+
 USE_OP(elementwise_add);
+USE_OP(gaussian_random);
 
 using namespace paddle::platform;
 using namespace paddle::framework;
 
+typedef paddle::framework::BlockDesc proto_block;
+typedef paddle::framework::OpDesc proto_op;
+
+using std::string;
+
+void add_gaussian_random_op(string var_name, proto_block* block) {
+  std::vector<int> dim{2, 3};  // shape used for both the variable and the op
+
+  // Declare an FP32 LoDTensor variable called var_name with shape `dim`.
+  auto a = block->add_vars();
+  a->set_name(var_name);
+  auto a_lt = a->mutable_lod_tensor();
+  a_lt->set_data_type(paddle::framework::DataType::FP32);
+  for (int i : dim) {
+    a_lt->add_dims(i);
+  }
+
+  // Append a gaussian_random op that writes to var_name through "Out".
+  auto op = block->add_ops();
+  op->set_type("gaussian_random");
+  auto dims = op->add_attrs();
+  dims->set_name("dims");
+  dims->set_type(paddle::framework::AttrType::INTS);
+  for (int i : dim) {
+    dims->add_ints(i);
+  }
+  auto Out = op->add_outputs();
+  Out->set_parameter("Out");
+  Out->add_arguments(var_name);
+}
+
 TEST(Executor, Init) {
   ProgramDesc pdesc;
 
@@ -33,35 +66,25 @@ TEST(Executor, Init) {
   root_block->set_idx(0);
   root_block->set_parent_idx(-1);
 
-  auto a = root_block->add_vars();
-  a->set_name("a");
-  auto a_lt = a->mutable_lod_tensor();
-  a_lt->set_data_type(paddle::framework::DataType::FP32);
-  a_lt->add_dims(640);
-  a_lt->add_dims(640);
-
-  auto b = root_block->add_vars();
-  b->set_name("b");
-  auto b_lt = b->mutable_lod_tensor();
-  b_lt->set_data_type(paddle::framework::DataType::FP32);
-  b_lt->add_dims(640);
-  b_lt->add_dims(640);
+  add_gaussian_random_op("a", root_block);
+  add_gaussian_random_op("b", root_block);
 
   auto c = root_block->add_vars();
   c->set_name("c");
   auto c_lt = c->mutable_lod_tensor();
   c_lt->set_data_type(paddle::framework::DataType::FP32);
-  c_lt->add_dims(640);
-  c_lt->add_dims(640);
 
-  auto op1 = root_block->add_ops();
-  op1->set_type("elementwise_add");
-  auto X = op1->add_inputs();
+  auto op = root_block->add_ops();
+  op->set_type("elementwise_add");
+  auto X = op->add_inputs();
   X->set_parameter("X");
   X->add_arguments("a");
-  auto Y = op1->add_inputs();
+  auto Y = op->add_inputs();
   Y->set_parameter("Y");
   Y->add_arguments("b");
+  auto Out = op->add_outputs();
+  Out->set_parameter("Out");
+  Out->add_arguments("c");
 
   CPUPlace cpu_place1, cpu_place2;
   std::vector<Place> places;

From 395051512dbaaa8baa4570f8bac10da152bb68ad Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Tue, 3 Oct 2017 16:56:50 -0700
Subject: [PATCH 14/61] remove device context manager

---
 paddle/framework/CMakeLists.txt           |  2 +-
 paddle/framework/executor.cc              | 12 ++--
 paddle/framework/executor.h               |  3 +-
 paddle/platform/CMakeLists.txt            |  2 -
 paddle/platform/device_context_manager.cc | 68 -----------------------
 paddle/platform/device_context_manager.h  | 58 -------------------
 6 files changed, 7 insertions(+), 138 deletions(-)
 delete mode 100644 paddle/platform/device_context_manager.cc
 delete mode 100644 paddle/platform/device_context_manager.h

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 898b3a990d..dde96d19e4 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -44,7 +44,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device_context_manager scope framework_proto ${GLOB_OP_LIB})
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto ${GLOB_OP_LIB})
 if(WITH_GPU)
     nv_test(executor_test SRCS executor_test.cc DEPS executor)
 else()
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 717f9bf81a..766945db9b 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -25,14 +25,12 @@ Executor::Executor(const std::vector<platform::Place>& places) {
   device_contexts_.resize(places.size());
   for (size_t i = 0; i < places.size(); i++) {
     if (platform::is_cpu_place(places[i])) {
-      device_contexts_[i] = platform::DeviceContextManager::Get()
-                                ->GetDeviceContext<platform::CPUPlace>(
-                                    boost::get<platform::CPUPlace>(places[i]));
+      device_contexts_[i].reset(new platform::CPUDeviceContext(
+          boost::get<platform::CPUPlace>(places[i])));
     } else {
 #ifndef PADDLE_ONLY_CPU
-      device_contexts_[i] = platform::DeviceContextManager::Get()
-                                ->GetDeviceContext<platform::GPUPlace>(
-                                    boost::get<platform::GPUPlace>(places[i]));
+      device_contexts_[i].reset(new platform::CUDADeviceContext(
+          boost::get<platform::CPUPlace>(places[i])));
 #else
       PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
 #endif
@@ -63,7 +61,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope,
   }
 
   // TODO(tonyyang-svail): need to test gpu device
-  for (auto device_context : device_contexts_) {
+  for (auto& device_context : device_contexts_) {
     device_context->Wait();
   }
 }
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index 795b8ffdab..d5c21c59fe 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor.h"
-#include "paddle/platform/device_context_manager.h"
 
 namespace paddle {
 namespace framework {
@@ -30,7 +29,7 @@ class Executor {
   void Run(const ProgramDesc&, Scope*, std::vector<Tensor>*);
 
  private:
-  std::vector<platform::DeviceContext*> device_contexts_;
+  std::vector<std::unique_ptr<platform::DeviceContext>> device_contexts_;
 };
 
 }  // namespace framework
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index b4ddf721dd..daf519b91d 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -23,7 +23,5 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
     system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
 
-cc_library(device_context_manager SRCS device_context_manager.cc DEPS device_context)
-
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
diff --git a/paddle/platform/device_context_manager.cc b/paddle/platform/device_context_manager.cc
deleted file mode 100644
index 156d317c8a..0000000000
--- a/paddle/platform/device_context_manager.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/platform/device_context_manager.h"
-
-namespace paddle {
-namespace platform {
-
-DeviceContextManager::DeviceContextManager() {
-#ifndef PADDLE_ONLY_CPU
-  device_count_ = GetDeviceCount();
-  cuda_contexts_.reserve(device_count_);
-  for (int i = 0; i < device_count_; i++) {
-    cuda_contexts_[i] = nullptr;
-  }
-#endif
-}
-
-template <>
-CPUDeviceContext* DeviceContextManager::GetDeviceContext<
-    CPUPlace, CPUDeviceContext>(const CPUPlace& place) {
-  if (!cpu_context_) {
-    cpu_context_ = new CPUDeviceContext(place);
-  }
-  return cpu_context_;
-}
-
-#ifndef PADDLE_ONLY_CPU
-template <>
-CUDADeviceContext* DeviceContextManager::GetDeviceContext<
-    GPUPlace, CUDADeviceContext>(const GPUPlace& place) {
-  int gpu_id = place.device;
-  PADDLE_ENFORCE(gpu_id < device_count_,
-                 "GPU device id must less than device count");
-  SetDeviceId(gpu_id);
-  if (!cuda_contexts_[gpu_id]) {
-    cuda_contexts_[gpu_id] = new CUDADeviceContext(place);
-  }
-  return cuda_contexts_[gpu_id];
-}
-#endif
-
-DeviceContextManager::~DeviceContextManager() {
-  if (cpu_context_) {
-    delete cpu_context_;
-  }
-#ifndef PADDLE_ONLY_CPU
-  for (int i = 0; i < device_count_; i++) {
-    if (cuda_contexts_[i]) {
-      delete cuda_contexts_[i];
-    }
-  }
-#endif
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/platform/device_context_manager.h b/paddle/platform/device_context_manager.h
deleted file mode 100644
index da15808a60..0000000000
--- a/paddle/platform/device_context_manager.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/platform/device_context.h"
-
-namespace paddle {
-namespace platform {
-
-template <typename T>
-struct Converter;
-
-template <>
-struct Converter<CPUPlace> {
-  using DeviceContextType = CPUDeviceContext;
-};
-
-#ifndef PADDLE_ONLY_CPU
-template <>
-struct Converter<GPUPlace> {
-  using DeviceContextType = CUDADeviceContext;
-};
-#endif
-
-class DeviceContextManager {
- public:
-  DeviceContextManager();
-  ~DeviceContextManager();
-
-  template <typename PlaceType, typename DeviceType = typename Converter<
-                                    PlaceType>::DeviceContextType>
-  DeviceType* GetDeviceContext(const PlaceType& place);
-
-  static DeviceContextManager* Get() {
-    static DeviceContextManager inst;
-    return &inst;
-  }
-
- private:
-  CPUDeviceContext* cpu_context_;
-#ifndef PADDLE_ONLY_CPU
-  int device_count_;
-  std::vector<CUDADeviceContext*> cuda_contexts_;
-#endif
-};
-}  // namespace platform
-}  // namespace paddle

From fe10e86dd536cc22f65a07a1900bb8b199a8bd5b Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Wed, 4 Oct 2017 14:05:37 -0700
Subject: [PATCH 15/61] fix gpu build error

---
 paddle/framework/executor.cc | 19 ++++++++++++++-----
 paddle/framework/executor.h  |  4 ++--
 paddle/platform/gpu_info.cc  |  2 +-
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 03504952ed..aa36b7438f 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -27,12 +27,12 @@ Executor::Executor(const std::vector<platform::Place>& places) {
   device_contexts_.resize(places.size());
   for (size_t i = 0; i < places.size(); i++) {
     if (platform::is_cpu_place(places[i])) {
-      device_contexts_[i].reset(new platform::CPUDeviceContext(
-          boost::get<platform::CPUPlace>(places[i])));
-    } else {
+      device_contexts_[i] = new platform::CPUDeviceContext(
+          boost::get<platform::CPUPlace>(places[i]));
+    } else if (platform::is_gpu_place(places[i])) {
 #ifndef PADDLE_ONLY_CPU
-      device_contexts_[i].reset(new platform::CUDADeviceContext(
-          boost::get<platform::CPUPlace>(places[i])));
+      device_contexts_[i] = new platform::CUDADeviceContext(
+          boost::get<platform::GPUPlace>(places[i]));
 #else
       PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
 #endif
@@ -40,6 +40,14 @@ Executor::Executor(const std::vector<platform::Place>& places) {
   }
 }
 
+// Frees every device context that the constructor allocated with `new`.
+Executor::~Executor() {
+  // Deleting a null pointer is a no-op, so no explicit check is needed.
+  for (auto* device_context : device_contexts_) {
+    delete device_context;
+  }
+}
+
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope,
                    std::vector<Tensor>* outputs) {
   // TODO(tonyyang-svail):
@@ -59,6 +67,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope,
 
   for (auto& op_desc : block.ops()) {
     auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+    std::cout << op->DebugString() << std::endl;
     op->Run(*scope, *device);
   }
 
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index d5c21c59fe..fc53be37c3 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -25,11 +25,11 @@ namespace framework {
 class Executor {
  public:
   explicit Executor(const std::vector<platform::Place>& places);
-  ~Executor() {}
+  ~Executor();
   void Run(const ProgramDesc&, Scope*, std::vector<Tensor>*);
 
  private:
-  std::vector<std::unique_ptr<platform::DeviceContext>> device_contexts_;
+  std::vector<platform::DeviceContext*> device_contexts_;
 };
 
 }  // namespace framework
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index f487014871..0464797f31 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -43,7 +43,7 @@ int GetCurrentDeviceId() {
 }
 
 void SetDeviceId(int id) {
-  PADDLE_ENFORCE(id < GetDeviceCount(), "id must less than GPU count")
+  PADDLE_ENFORCE(id < GetDeviceCount(), "id must less than GPU count");
   PADDLE_ENFORCE(cudaSetDevice(id),
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }

From 3014f6a1135e113cb55a6a2cb771d477502a8b00 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Wed, 4 Oct 2017 17:36:19 -0700
Subject: [PATCH 16/61] correct macro

---
 paddle/framework/executor.cc      | 2 +-
 paddle/framework/executor_test.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index aa36b7438f..7c3cac359e 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -30,7 +30,7 @@ Executor::Executor(const std::vector<platform::Place>& places) {
       device_contexts_[i] = new platform::CPUDeviceContext(
           boost::get<platform::CPUPlace>(places[i]));
     } else if (platform::is_gpu_place(places[i])) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
       device_contexts_[i] = new platform::CUDADeviceContext(
           boost::get<platform::GPUPlace>(places[i]));
 #else
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index f746242a6b..ca7e8ca7d2 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -103,7 +103,7 @@ TEST_F(ExecutorTester, InitCPU) {
   delete executor;
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST_F(ExecutorTester, InitGPU) {
   std::vector<Place> places;
   GPUPlace gpu_place0(0);

From 623848afa1f0bb3a69c7e49c4fa0f763a252669d Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Thu, 5 Oct 2017 12:11:56 -0700
Subject: [PATCH 17/61] add feed operator

---
 paddle/framework/scope.cc         | 16 ++++++++++
 paddle/framework/scope.h          |  2 ++
 paddle/operators/activation_op.cu | 18 +++++------
 paddle/operators/feed_op.cc       | 52 +++++++++++++++++++++++++++++++
 paddle/operators/feed_op.cu       | 18 +++++++++++
 paddle/operators/feed_op.h        | 40 ++++++++++++++++++++++++
 6 files changed, 137 insertions(+), 9 deletions(-)
 create mode 100644 paddle/operators/feed_op.cc
 create mode 100644 paddle/operators/feed_op.cu
 create mode 100644 paddle/operators/feed_op.h

diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 080b4ac621..b04120abf2 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/scope.h"
+#include <memory>  // for unique_ptr
+#include <mutex>   // for call_once
 #include "paddle/string/printf.h"
 
 namespace paddle {
@@ -62,5 +64,19 @@ void Scope::DropKids() {
   kids_.clear();
 }
 
+std::once_flag feed_variable_flag;  // guards NewVar("feed_value") in GetScope
+
+template <typename T, typename... Args>  // local stand-in for std::make_unique
+std::unique_ptr<T> make_unique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+framework::Scope* GetScope() {
+  // Process-wide scope, built on first use; "feed_value" is added only once.
+  static std::unique_ptr<framework::Scope> g_scope(new framework::Scope());
+  std::call_once(feed_variable_flag, [] { g_scope->NewVar("feed_value"); });
+  return g_scope.get();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 7047f0d55e..96f3ae875b 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -73,5 +73,7 @@ class Scope {
   DISABLE_COPY_AND_ASSIGN(Scope);
 };
 
+framework::Scope* GetScope();
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu
index 93e9f1c694..44a6aaf9cb 100644
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/activation_op.h"
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
new file mode 100644
index 0000000000..805c3600be
--- /dev/null
+++ b/paddle/operators/feed_op.cc
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/feed_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Feed op: "Out" gets its shape from the col-th global "feed_value" tensor.
+class FeedOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    typedef std::vector<framework::Tensor> FeedInputs;
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output should be not null.");
+    const int col = ctx->Attrs().Get<int>("col");
+    const FeedInputs& tensors =
+        framework::GetScope()->FindVar("feed_value")->Get<FeedInputs>();
+    // "Out" is the only output slot declared by FeedOpMaker.
+    ctx->SetOutputDim("Out", tensors[col].dims());
+    // need to handle LodTensor later
+  }
+};
+
+class FeedOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FeedOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<int>("col", "The col in Global Feed Variable");  // tensor index
+    AddOutput("Out", "The output of feed op.");  // written by FeedKernel
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(feed, ops::FeedOp, ops::FeedOpMaker);
+REGISTER_OP_CPU_KERNEL(feed, ops::FeedKernel<float>);
diff --git a/paddle/operators/feed_op.cu b/paddle/operators/feed_op.cu
new file mode 100644
index 0000000000..7b6a2ac91e
--- /dev/null
+++ b/paddle/operators/feed_op.cu
@@ -0,0 +1,18 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/feed_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(feed, ops::FeedKernel<float>);
diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h
new file mode 100644
index 0000000000..57781e205f
--- /dev/null
+++ b/paddle/operators/feed_op.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Copies the col-th tensor of the global "feed_value" variable into "Out".
+template <typename T>
+class FeedKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    typedef std::vector<framework::Tensor> FeedInputs;
+    Tensor* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    const int col = ctx.template Attr<int>("col");
+    const FeedInputs& tensors =  // by reference: no per-run copy of the list
+        framework::GetScope()->FindVar("feed_value")->Get<FeedInputs>();
+    out->CopyFrom<T>(tensors[col], ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle

From 20725f2d52bd3f6d54df45c710872b9b8ee52e14 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Thu, 5 Oct 2017 14:55:29 -0700
Subject: [PATCH 18/61] add executor feed operator test

---
 paddle/framework/executor.cc      |  20 ++--
 paddle/framework/executor.h       |   2 +-
 paddle/framework/executor_test.cc | 155 +++++++++++++++++++++++++++---
 paddle/operators/feed_op.cc       |  15 ++-
 4 files changed, 167 insertions(+), 25 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 7c3cac359e..aafef12554 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -48,8 +48,7 @@ Executor::~Executor() {
   }
 }
 
-void Executor::Run(const ProgramDesc& pdesc, Scope* scope,
-                   std::vector<Tensor>* outputs) {
+void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
   // TODO(tonyyang-svail):
   //    - only runs the first block
   //    - only runs on the first device
@@ -76,14 +75,15 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope,
     device_context->Wait();
   }
   // // print tensor value
-  // for (auto& var : block.vars()) {
-  //   std::cout << var.name() << std::endl;
-  //   auto v = scope->FindVar(var.name());
-  //   const LoDTensor& t = v->Get<LoDTensor>();
-  //   for (int i = 0; i < t.numel(); ++i)
-  //     std::cout << t.data<float>()[i] << " ";
-  //   std::cout << std::endl;
-  // }
+  for (auto& var : block.vars()) {
+    std::cout << var.name() << std::endl;
+    auto v = scope->FindVar(var.name());
+    const LoDTensor& t = v->Get<LoDTensor>();
+    for (int i = 0; i < t.numel(); ++i) {
+      std::cout << t.data<float>()[i] << " ";
+    }
+    std::cout << std::endl;
+  }
 }
 
 }  // namespace framework
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index fc53be37c3..9e443c8fca 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -26,7 +26,7 @@ class Executor {
  public:
   explicit Executor(const std::vector<platform::Place>& places);
   ~Executor();
-  void Run(const ProgramDesc&, Scope*, std::vector<Tensor>*);
+  void Run(const ProgramDesc&, Scope*);
 
  private:
   std::vector<platform::DeviceContext*> device_contexts_;
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index ca7e8ca7d2..0856d1f32e 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -13,17 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/executor.h"
+#include <memory>  // for unique_ptr
+#include <mutex>   // for call_once
+#include <vector>
 #include "gtest/gtest.h"
 #include "paddle/framework/attribute.h"
-
 #include "paddle/framework/grad_op_builder.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 
-#include <vector>
-
 USE_OP(elementwise_add);
 USE_OP(gaussian_random);
+USE_OP(feed);
 
 using std::string;
 using namespace paddle::platform;
@@ -58,7 +59,67 @@ void add_gaussian_random_op(string var_name, proto_block* block) {
   Out->add_arguments(var_name);
 }
 
-class ExecutorTester : public ::testing::Test {
+void add_feed_op(string var_name, int index, proto_block* block) {
+  std::vector<int> dim{3};
+
+  // insert variable
+  auto a = block->add_vars();
+  a->set_name(var_name);
+  auto a_lt = a->mutable_lod_tensor();
+  a_lt->set_data_type(paddle::framework::DataType::FP32);
+  for (int i : dim) {
+    a_lt->add_dims(i);
+  }
+
+  // insert operation
+  auto op = block->add_ops();
+  op->set_type("feed");
+
+  // set dims attr
+  auto dims = op->add_attrs();
+  dims->set_name("dims");
+  dims->set_type(paddle::framework::AttrType::INTS);
+  for (int i : dim) {
+    dims->add_ints(i);
+  }
+
+  // set col attr
+  auto col = op->add_attrs();
+  col->set_name("col");
+  col->set_type(paddle::framework::AttrType::INT);
+  col->set_i(index);
+
+  auto Out = op->add_outputs();
+  Out->set_parameter("Out");
+  Out->add_arguments(var_name);
+}
+
+std::once_flag set_variable_flag;
+
+template <typename T>
+void set_feed_variable(const std::vector<std::vector<T>>& inputs) {
+  typedef std::vector<paddle::framework::Tensor> FeedInputs;
+  Variable* g_feed_value = GetScope()->FindVar("feed_value");
+  FeedInputs& feed_inputs = *(g_feed_value->GetMutable<FeedInputs>());
+  auto size = inputs.size();
+
+  std::call_once(set_variable_flag, [&]() {
+    feed_inputs.reserve(size);
+    for (size_t i = 0; i < size; i++) {
+      paddle::framework::Tensor tmp;
+      tmp.mutable_data<T>(make_ddim({static_cast<int64_t>(inputs[i].size())}),
+                          CPUPlace());
+      feed_inputs.push_back(tmp);
+    }
+  });
+
+  for (size_t i = 0; i < size; i++) {
+    memcpy(feed_inputs[i].data<T>(), inputs[i].data(),
+           inputs[i].size() * sizeof(T));
+  }
+}
+
+class ExecutorTesterRandom : public ::testing::Test {
  public:
   virtual void SetUp() override {
     auto root_block = pdesc_.add_blocks();
@@ -84,33 +145,103 @@ class ExecutorTester : public ::testing::Test {
     auto Out = op->add_outputs();
     Out->set_parameter("Out");
     Out->add_arguments("c");
+
+    scope_ = GetScope();
   }
 
  protected:
-  std::vector<Tensor>* outputs_{nullptr};
   ProgramDesc pdesc_;
-  Scope scope_;
+  Scope* scope_;
 };
 
-TEST_F(ExecutorTester, InitCPU) {
+class ExecutorTesterFeed : public ::testing::Test {
+ public:
+  virtual void SetUp() override {
+    auto root_block = pdesc_.add_blocks();
+    root_block->set_idx(0);
+    root_block->set_parent_idx(-1);
+
+    add_feed_op("a", 0, root_block);
+    add_feed_op("b", 1, root_block);
+
+    auto c = root_block->add_vars();
+    c->set_name("c");
+    auto c_lt = c->mutable_lod_tensor();
+    c_lt->set_data_type(paddle::framework::DataType::FP32);
+
+    auto op = root_block->add_ops();
+    op->set_type("elementwise_add");
+    auto X = op->add_inputs();
+    X->set_parameter("X");
+    X->add_arguments("a");
+    auto Y = op->add_inputs();
+    Y->set_parameter("Y");
+    Y->add_arguments("b");
+    auto Out = op->add_outputs();
+    Out->set_parameter("Out");
+    Out->add_arguments("c");
+
+    std::vector<float> vec1 = {1.0, 2.0, 3.0};
+    std::vector<float> vec2 = {4.0, 5.0, 6.0};
+    inputs_.push_back(vec1);
+    inputs_.push_back(vec2);
+  }
+
+ protected:
+  ProgramDesc pdesc_;
+  std::vector<std::vector<float>> inputs_;
+};
+
+TEST_F(ExecutorTesterRandom, CPU) {
   std::vector<Place> places;
   CPUPlace cpu_place1, cpu_place2;
   places.push_back(cpu_place1);
   places.push_back(cpu_place2);
 
   Executor* executor = new Executor(places);
-  executor->Run(pdesc_, &scope_, outputs_);
+  executor->Run(pdesc_, scope_);
+  delete executor;
+}
+
+TEST_F(ExecutorTesterFeed, CPU) {
+  std::vector<Place> places;
+  CPUPlace cpu_place;
+  places.push_back(cpu_place);
+
+  Executor* executor = new Executor(places);
+
+  // 3 mini-batch
+  for (int i = 0; i < 3; i++) {
+    // need to set feed variable before Executor::Run
+    set_feed_variable<float>(inputs_);
+    executor->Run(pdesc_, GetScope());
+  }
+
   delete executor;
 }
 
 #ifdef PADDLE_WITH_GPU
-TEST_F(ExecutorTester, InitGPU) {
+TEST_F(ExecutorTesterRandom, GPU) {
+  std::vector<Place> places;
+  GPUPlace gpu_place(0);
+  places.push_back(gpu_place);
+
+  Executor* executor = new Executor(places);
+  executor->Run(pdesc_, scope_);
+  delete executor;
+}
+
+TEST_F(ExecutorTesterFeed, GPU) {
   std::vector<Place> places;
-  GPUPlace gpu_place0(0);
-  places.push_back(gpu_place0);
+  GPUPlace gpu_place(0);
+  places.push_back(gpu_place);
 
   Executor* executor = new Executor(places);
-  executor->Run(pdesc_, &scope_, outputs_);
+
+  // need to set feed variable before Executor::Run
+  set_feed_variable<float>(inputs_);
+  executor->Run(pdesc_, scope_);
+
   delete executor;
 }
 #endif
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index 805c3600be..5ae882bc8a 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -28,19 +28,30 @@ class FeedOp : public framework::OperatorWithKernel {
     int col = ctx->Attrs().Get<int>("col");
     framework::Variable* g_feed_variable =
         framework::GetScope()->FindVar("feed_value");
+
     FeedInputs tensors = g_feed_variable->Get<FeedInputs>();
+
     auto in_dim = tensors[col].dims();
-    ctx->SetOutputDim("Y", in_dim);
+    ctx->SetOutputDim("Out", in_dim);
     // need to handle LodTensor later
   }
+
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return static_cast<framework::DataType>(Attr<int>("data_type"));
+  }
 };
 
 class FeedOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   FeedOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddAttr<int>("col", "The col in Global Feed Variable");
+    AddAttr<int>("data_type", "output data type")
+        .SetDefault(framework::DataType::FP32);
+    AddAttr<int>("col", "The col in global feed variable").SetDefault(0);
+    AddAttr<std::vector<int>>("dims", "The dimension of random tensor.");
     AddOutput("Out", "The output of dropout op.");
+    AddComment(R"DOC(Feed data to global feed variable)DOC");
   }
 };
 

From 45c4dcaabb4cbf140384dcffe3392d2e10b2a6d7 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Thu, 5 Oct 2017 15:54:44 -0700
Subject: [PATCH 19/61] add fetch operator

---
 paddle/framework/executor.cc      | 18 ++++----
 paddle/framework/executor_test.cc | 67 ++++++++++++++++++++++++++++++
 paddle/framework/scope.cc         |  5 ++-
 paddle/operators/activation_op.cu | 18 ++++----
 paddle/operators/feed_op.cc       |  6 +--
 paddle/operators/fetch_op.cc      | 68 +++++++++++++++++++++++++++++++
 paddle/operators/fetch_op.cu      | 18 ++++++++
 paddle/operators/fetch_op.h       | 40 ++++++++++++++++++
 8 files changed, 218 insertions(+), 22 deletions(-)
 create mode 100644 paddle/operators/fetch_op.cc
 create mode 100644 paddle/operators/fetch_op.cu
 create mode 100644 paddle/operators/fetch_op.h

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index aafef12554..51ddb7e58e 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -75,15 +75,15 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
     device_context->Wait();
   }
   // // print tensor value
-  for (auto& var : block.vars()) {
-    std::cout << var.name() << std::endl;
-    auto v = scope->FindVar(var.name());
-    const LoDTensor& t = v->Get<LoDTensor>();
-    for (int i = 0; i < t.numel(); ++i) {
-      std::cout << t.data<float>()[i] << " ";
-    }
-    std::cout << std::endl;
-  }
+  // for (auto& var : block.vars()) {
+  //   std::cout << var.name() << std::endl;
+  //   auto v = scope->FindVar(var.name());
+  //   const LoDTensor& t = v->Get<LoDTensor>();
+  //   for (int i = 0; i < t.numel(); ++i) {
+  //     std::cout << t.data<float>()[i] << " ";
+  //   }
+  //   std::cout << std::endl;
+  // }
 }
 
 }  // namespace framework
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 0856d1f32e..980f5f579c 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 USE_OP(elementwise_add);
 USE_OP(gaussian_random);
 USE_OP(feed);
+USE_OP(fetch);
 
 using std::string;
 using namespace paddle::platform;
@@ -94,6 +95,41 @@ void add_feed_op(string var_name, int index, proto_block* block) {
   Out->add_arguments(var_name);
 }
 
+void add_fetch_op(string var_name, int index, proto_block* block) {
+  std::vector<int> dim{3};
+
+  // insert variable
+  auto a = block->add_vars();
+  a->set_name(var_name);
+  auto a_lt = a->mutable_lod_tensor();
+  a_lt->set_data_type(paddle::framework::DataType::FP32);
+  for (int i : dim) {
+    a_lt->add_dims(i);
+  }
+
+  // insert operation
+  auto op = block->add_ops();
+  op->set_type("fetch");
+
+  // set dims attr
+  auto dims = op->add_attrs();
+  dims->set_name("dims");
+  dims->set_type(paddle::framework::AttrType::INTS);
+  for (int i : dim) {
+    dims->add_ints(i);
+  }
+
+  // set col attr
+  auto col = op->add_attrs();
+  col->set_name("col");
+  col->set_type(paddle::framework::AttrType::INT);
+  col->set_i(index);
+
+  auto Out = op->add_inputs();
+  Out->set_parameter("Input");
+  Out->add_arguments(var_name);
+}
+
 std::once_flag set_variable_flag;
 
 template <typename T>
@@ -119,6 +155,27 @@ void set_feed_variable(const std::vector<std::vector<T>>& inputs) {
   }
 }
 
+template <typename T>
+std::vector<std::vector<T>> get_fetch_variable() {
+  typedef std::vector<paddle::framework::Tensor> FetchOutputs;
+  Variable* g_fetch_value = GetScope()->FindVar("fetch_value");
+  FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable<FetchOutputs>());
+  auto size = fetch_outputs.size();
+
+  std::vector<std::vector<T>> result;
+  result.reserve(size);
+
+  for (size_t i = 0; i < size; i++) {
+    std::vector<T> tmp;
+    tmp.reserve(fetch_outputs[i].numel());
+    memcpy(tmp.data(), fetch_outputs[i].data<T>(),
+           fetch_outputs[i].numel() * sizeof(T));
+    result.push_back(tmp);
+  }
+
+  return result;
+}
+
 class ExecutorTesterRandom : public ::testing::Test {
  public:
   virtual void SetUp() override {
@@ -181,6 +238,8 @@ class ExecutorTesterFeed : public ::testing::Test {
     Out->set_parameter("Out");
     Out->add_arguments("c");
 
+    add_fetch_op("c", 0, root_block);
+
     std::vector<float> vec1 = {1.0, 2.0, 3.0};
     std::vector<float> vec2 = {4.0, 5.0, 6.0};
     inputs_.push_back(vec1);
@@ -213,8 +272,16 @@ TEST_F(ExecutorTesterFeed, CPU) {
   // 3 mini-batch
   for (int i = 0; i < 3; i++) {
     // need to set feed variable before Executor::Run
+    std::cout << "start mini-batch " << i << std::endl;
     set_feed_variable<float>(inputs_);
     executor->Run(pdesc_, GetScope());
+    std::vector<std::vector<float>> result = get_fetch_variable<float>();
+    for (auto& vec : result) {
+      for (auto& num : vec) {
+        std::cout << num << " ";
+      }
+      std::cout << std::endl;
+    }
   }
 
   delete executor;
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index b04120abf2..2c416570cf 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -74,7 +74,10 @@ std::unique_ptr<T> make_unique(Args&&... args) {
 framework::Scope* GetScope() {
   static std::unique_ptr<framework::Scope> g_scope =
       make_unique<framework::Scope>();
-  std::call_once(feed_variable_flag, [&]() { g_scope->NewVar("feed_value"); });
+  std::call_once(feed_variable_flag, [&]() {
+    g_scope->NewVar("feed_value");
+    g_scope->NewVar("fetch_value");
+  });
   return g_scope.get();
 }
 
diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu
index 44a6aaf9cb..93e9f1c694 100644
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
 
-http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/activation_op.h"
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index 5ae882bc8a..a61855cb99 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -49,9 +49,9 @@ class FeedOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("data_type", "output data type")
         .SetDefault(framework::DataType::FP32);
     AddAttr<int>("col", "The col in global feed variable").SetDefault(0);
-    AddAttr<std::vector<int>>("dims", "The dimension of random tensor.");
-    AddOutput("Out", "The output of dropout op.");
-    AddComment(R"DOC(Feed data to global feed variable)DOC");
+    AddAttr<std::vector<int>>("dims", "The dimension of feed tensor.");
+    AddOutput("Out", "The output of feed op.");
+    AddComment(R"DOC(Feed data from global feed variable)DOC");
   }
 };
 
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
new file mode 100644
index 0000000000..68e8d26dbe
--- /dev/null
+++ b/paddle/operators/fetch_op.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/fetch_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FetchOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    typedef std::vector<framework::Tensor> FetchOutputs;
+    PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should be not null.");
+    int col = ctx->Attrs().Get<int>("col");
+    framework::Variable* g_fetch_variable =
+        framework::GetScope()->FindVar("fetch_value");
+
+    FetchOutputs* tensors = g_fetch_variable->GetMutable<FetchOutputs>();
+    if (tensors->size() < col) {
+      tensors->resize(col);
+    }
+
+    auto input_dim = ctx->GetInputDim("Input");
+    framework::Tensor tmp;
+    tmp.Resize(input_dim);
+    (*tensors)[col].Resize(input_dim);
+    // need to handle LodTensor later
+  }
+
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return static_cast<framework::DataType>(Attr<int>("data_type"));
+  }
+};
+
+class FetchOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FetchOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<int>("data_type", "output data type")
+        .SetDefault(framework::DataType::FP32);
+    AddAttr<int>("col", "The col in global fetch variable").SetDefault(0);
+    AddAttr<std::vector<int>>("dims", "The dimension of fetch tensor.");
+    AddInput("Input", "The output of fetch op.");
+    AddComment(R"DOC(Fetch data to global fetch variable)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(fetch, ops::FetchOp, ops::FetchOpMaker);
+REGISTER_OP_CPU_KERNEL(fetch, ops::FetchKernel<float>);
diff --git a/paddle/operators/fetch_op.cu b/paddle/operators/fetch_op.cu
new file mode 100644
index 0000000000..2e24d3a8ad
--- /dev/null
+++ b/paddle/operators/fetch_op.cu
@@ -0,0 +1,18 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/feed_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(fetch, ops::FetchKernel<float>);
diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h
new file mode 100644
index 0000000000..95e7986a22
--- /dev/null
+++ b/paddle/operators/fetch_op.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class FetchKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    typedef std::vector<framework::Tensor> FetchOutputs;
+    Tensor* input = ctx.Output<Tensor>("Input");
+    int col = ctx.template Attr<int>("col");
+    framework::Variable* g_fetch_variable =
+        framework::GetScope()->FindVar("fetch_value");
+    FetchOutputs tensors = g_fetch_variable->Get<FetchOutputs>();
+    tensors[col].mutable_data<T>(platform::CPUPlace());
+    tensors[col].CopyFrom<T>(*input, platform::CPUPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle

From 48b080db9fcc4f34535c98878112e6633d6d8d7d Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Thu, 5 Oct 2017 20:48:04 -0700
Subject: [PATCH 20/61] ensure global BuddyAllocator is initialized before
 global Scope

---
 paddle/framework/executor_test.cc | 94 +++++++++++++++++--------------
 paddle/operators/feed_op.cc       |  4 +-
 paddle/operators/feed_op.h        |  2 +-
 paddle/operators/fetch_op.cc      |  7 ++-
 paddle/operators/fetch_op.h       |  8 +--
 5 files changed, 62 insertions(+), 53 deletions(-)

diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 980f5f579c..d3ea18d154 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/executor.h"
-#include <memory>  // for unique_ptr
-#include <mutex>   // for call_once
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/framework/attribute.h"
@@ -34,9 +32,8 @@ using namespace paddle::framework;
 typedef paddle::framework::BlockDesc proto_block;
 typedef paddle::framework::OpDesc proto_op;
 
-void add_gaussian_random_op(string var_name, proto_block* block) {
-  std::vector<int> dim{2, 3};
-
+void add_gaussian_random_op(string var_name, std::vector<int>& dim,
+                            proto_block* block) {
   // insert variable
   auto a = block->add_vars();
   a->set_name(var_name);
@@ -60,9 +57,8 @@ void add_gaussian_random_op(string var_name, proto_block* block) {
   Out->add_arguments(var_name);
 }
 
-void add_feed_op(string var_name, int index, proto_block* block) {
-  std::vector<int> dim{3};
-
+void add_feed_op(string var_name, std::vector<int>& dim, int index,
+                 proto_block* block) {
   // insert variable
   auto a = block->add_vars();
   a->set_name(var_name);
@@ -95,9 +91,8 @@ void add_feed_op(string var_name, int index, proto_block* block) {
   Out->add_arguments(var_name);
 }
 
-void add_fetch_op(string var_name, int index, proto_block* block) {
-  std::vector<int> dim{3};
-
+void add_fetch_op(string var_name, std::vector<int>& dim, int index,
+                  proto_block* block) {
   // insert variable
   auto a = block->add_vars();
   a->set_name(var_name);
@@ -138,20 +133,11 @@ void set_feed_variable(const std::vector<std::vector<T>>& inputs) {
   Variable* g_feed_value = GetScope()->FindVar("feed_value");
   FeedInputs& feed_inputs = *(g_feed_value->GetMutable<FeedInputs>());
   auto size = inputs.size();
-
-  std::call_once(set_variable_flag, [&]() {
-    feed_inputs.reserve(size);
-    for (size_t i = 0; i < size; i++) {
-      paddle::framework::Tensor tmp;
-      tmp.mutable_data<T>(make_ddim({static_cast<int64_t>(inputs[i].size())}),
-                          CPUPlace());
-      feed_inputs.push_back(tmp);
-    }
-  });
-
+  feed_inputs.resize(size);
   for (size_t i = 0; i < size; i++) {
-    memcpy(feed_inputs[i].data<T>(), inputs[i].data(),
-           inputs[i].size() * sizeof(T));
+    T* dst = feed_inputs[i].mutable_data<T>(
+        make_ddim({static_cast<int64_t>(inputs[i].size())}), CPUPlace());
+    memcpy(dst, inputs[i].data(), inputs[i].size() * sizeof(T));
   }
 }
 
@@ -160,19 +146,17 @@ std::vector<std::vector<T>> get_fetch_variable() {
   typedef std::vector<paddle::framework::Tensor> FetchOutputs;
   Variable* g_fetch_value = GetScope()->FindVar("fetch_value");
   FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable<FetchOutputs>());
-  auto size = fetch_outputs.size();
 
+  auto size = fetch_outputs.size();
   std::vector<std::vector<T>> result;
   result.reserve(size);
-
   for (size_t i = 0; i < size; i++) {
     std::vector<T> tmp;
-    tmp.reserve(fetch_outputs[i].numel());
+    tmp.resize(fetch_outputs[i].numel());
     memcpy(tmp.data(), fetch_outputs[i].data<T>(),
            fetch_outputs[i].numel() * sizeof(T));
     result.push_back(tmp);
   }
-
   return result;
 }
 
@@ -183,8 +167,9 @@ class ExecutorTesterRandom : public ::testing::Test {
     root_block->set_idx(0);
     root_block->set_parent_idx(-1);
 
-    add_gaussian_random_op("a", root_block);
-    add_gaussian_random_op("b", root_block);
+    std::vector<int> dim{2, 3};
+    add_gaussian_random_op("a", dim, root_block);
+    add_gaussian_random_op("b", dim, root_block);
 
     auto c = root_block->add_vars();
     c->set_name("c");
@@ -203,12 +188,11 @@ class ExecutorTesterRandom : public ::testing::Test {
     Out->set_parameter("Out");
     Out->add_arguments("c");
 
-    scope_ = GetScope();
+    add_fetch_op("c", dim, 0, root_block);
   }
 
  protected:
   ProgramDesc pdesc_;
-  Scope* scope_;
 };
 
 class ExecutorTesterFeed : public ::testing::Test {
@@ -218,8 +202,10 @@ class ExecutorTesterFeed : public ::testing::Test {
     root_block->set_idx(0);
     root_block->set_parent_idx(-1);
 
-    add_feed_op("a", 0, root_block);
-    add_feed_op("b", 1, root_block);
+    std::vector<int> dim{6};
+
+    add_feed_op("a", dim, 0, root_block);
+    add_feed_op("b", dim, 1, root_block);
 
     auto c = root_block->add_vars();
     c->set_name("c");
@@ -238,10 +224,10 @@ class ExecutorTesterFeed : public ::testing::Test {
     Out->set_parameter("Out");
     Out->add_arguments("c");
 
-    add_fetch_op("c", 0, root_block);
+    add_fetch_op("c", dim, 0, root_block);
 
-    std::vector<float> vec1 = {1.0, 2.0, 3.0};
-    std::vector<float> vec2 = {4.0, 5.0, 6.0};
+    std::vector<float> vec1 = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+    std::vector<float> vec2 = {4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
     inputs_.push_back(vec1);
     inputs_.push_back(vec2);
   }
@@ -253,12 +239,24 @@ class ExecutorTesterFeed : public ::testing::Test {
 
 TEST_F(ExecutorTesterRandom, CPU) {
   std::vector<Place> places;
-  CPUPlace cpu_place1, cpu_place2;
-  places.push_back(cpu_place1);
-  places.push_back(cpu_place2);
+  CPUPlace cpu_place;
+  places.push_back(cpu_place);
+
+  // We have a global Scope and BuddyAllocator, and we must ensure
+  // global BuddyAllocator is initialized before global Scope. Thus,
+  // global Scope will deconstruct before BuddyAllocator. Otherwise,
+  // "pointer being freed was not allocated" error will appear.
+  paddle::memory::Used(cpu_place);
 
   Executor* executor = new Executor(places);
-  executor->Run(pdesc_, scope_);
+  executor->Run(pdesc_, GetScope());
+  std::vector<std::vector<float>> result = get_fetch_variable<float>();
+  for (auto& vec : result) {
+    for (auto& num : vec) {
+      std::cout << num << " ";
+    }
+    std::cout << std::endl;
+  }
   delete executor;
 }
 
@@ -267,6 +265,12 @@ TEST_F(ExecutorTesterFeed, CPU) {
   CPUPlace cpu_place;
   places.push_back(cpu_place);
 
+  // We have a global Scope and BuddyAllocator, and we must ensure
+  // global BuddyAllocator is initialized before global Scope. Thus,
+  // global Scope will deconstruct before BuddyAllocator. Otherwise,
+  // "pointer being freed was not allocated" error will appear.
+  paddle::memory::Used(cpu_place);
+
   Executor* executor = new Executor(places);
 
   // 3 mini-batch
@@ -293,8 +297,10 @@ TEST_F(ExecutorTesterRandom, GPU) {
   GPUPlace gpu_place(0);
   places.push_back(gpu_place);
 
+  paddle::memory::Used(gpu_place);
+
   Executor* executor = new Executor(places);
-  executor->Run(pdesc_, scope_);
+  executor->Run(pdesc_, GetScope());
   delete executor;
 }
 
@@ -303,11 +309,13 @@ TEST_F(ExecutorTesterFeed, GPU) {
   GPUPlace gpu_place(0);
   places.push_back(gpu_place);
 
+  paddle::memory::Used(gpu_place);
+
   Executor* executor = new Executor(places);
 
   // need to set feed variable before Executor::Run
   set_feed_variable<float>(inputs_);
-  executor->Run(pdesc_, scope_);
+  executor->Run(pdesc_, GetScope());
 
   delete executor;
 }
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index a61855cb99..d40db3ff2e 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -29,11 +29,11 @@ class FeedOp : public framework::OperatorWithKernel {
     framework::Variable* g_feed_variable =
         framework::GetScope()->FindVar("feed_value");
 
-    FeedInputs tensors = g_feed_variable->Get<FeedInputs>();
+    const FeedInputs& tensors = g_feed_variable->Get<FeedInputs>();
 
     auto in_dim = tensors[col].dims();
     ctx->SetOutputDim("Out", in_dim);
-    // need to handle LodTensor later
+    // TODO(qijun) need to handle LodTensor later
   }
 
   framework::DataType IndicateDataType(
diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h
index 57781e205f..cf93b6f434 100644
--- a/paddle/operators/feed_op.h
+++ b/paddle/operators/feed_op.h
@@ -31,7 +31,7 @@ class FeedKernel : public framework::OpKernel<T> {
     framework::Variable* g_feed_variable =
         framework::GetScope()->FindVar("feed_value");
     int col = ctx.template Attr<int>("col");
-    FeedInputs tensors = g_feed_variable->Get<FeedInputs>();
+    const FeedInputs& tensors = g_feed_variable->Get<FeedInputs>();
     out->CopyFrom<T>(tensors[col], ctx.GetPlace());
   }
 };
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index 68e8d26dbe..a885deacc8 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -30,15 +30,16 @@ class FetchOp : public framework::OperatorWithKernel {
         framework::GetScope()->FindVar("fetch_value");
 
     FetchOutputs* tensors = g_fetch_variable->GetMutable<FetchOutputs>();
-    if (tensors->size() < col) {
-      tensors->resize(col);
+    if (tensors->size() < static_cast<size_t>(col + 1)) {
+      tensors->resize(col + 1);
     }
 
     auto input_dim = ctx->GetInputDim("Input");
     framework::Tensor tmp;
     tmp.Resize(input_dim);
     (*tensors)[col].Resize(input_dim);
-    // need to handle LodTensor later
+
+    // TODO(qijun) need to handle LodTensor later
   }
 
   framework::DataType IndicateDataType(
diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h
index 95e7986a22..e8d5e3a9c0 100644
--- a/paddle/operators/fetch_op.h
+++ b/paddle/operators/fetch_op.h
@@ -26,13 +26,13 @@ class FetchKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     typedef std::vector<framework::Tensor> FetchOutputs;
-    Tensor* input = ctx.Output<Tensor>("Input");
+    const Tensor* input = ctx.Input<Tensor>("Input");
     int col = ctx.template Attr<int>("col");
     framework::Variable* g_fetch_variable =
         framework::GetScope()->FindVar("fetch_value");
-    FetchOutputs tensors = g_fetch_variable->Get<FetchOutputs>();
-    tensors[col].mutable_data<T>(platform::CPUPlace());
-    tensors[col].CopyFrom<T>(*input, platform::CPUPlace());
+    FetchOutputs* tensors = g_fetch_variable->GetMutable<FetchOutputs>();
+    (*tensors)[col].mutable_data<T>(platform::CPUPlace());
+    (*tensors)[col].CopyFrom<T>(*input, platform::CPUPlace());
   }
 };
 

From bbceb72398f23902fae2f011c2b6c7f2a8b7b8e3 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Thu, 5 Oct 2017 20:54:16 -0700
Subject: [PATCH 21/61] refine some code

---
 paddle/framework/executor.cc      | 10 ----------
 paddle/framework/executor_test.cc |  2 ++
 paddle/framework/scope.cc         |  9 ++-------
 paddle/operators/feed_op.cc       |  2 +-
 paddle/operators/fetch_op.cc      |  2 +-
 5 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 51ddb7e58e..ee0df039ac 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -74,16 +74,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
   for (auto& device_context : device_contexts_) {
     device_context->Wait();
   }
-  // // print tensor value
-  // for (auto& var : block.vars()) {
-  //   std::cout << var.name() << std::endl;
-  //   auto v = scope->FindVar(var.name());
-  //   const LoDTensor& t = v->Get<LoDTensor>();
-  //   for (int i = 0; i < t.numel(); ++i) {
-  //     std::cout << t.data<float>()[i] << " ";
-  //   }
-  //   std::cout << std::endl;
-  // }
 }
 
 }  // namespace framework
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index d3ea18d154..5e327cc893 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -130,6 +130,7 @@ std::once_flag set_variable_flag;
 template <typename T>
 void set_feed_variable(const std::vector<std::vector<T>>& inputs) {
   typedef std::vector<paddle::framework::Tensor> FeedInputs;
+  // Tensors in feed value variable will only be in CPUPlace
   Variable* g_feed_value = GetScope()->FindVar("feed_value");
   FeedInputs& feed_inputs = *(g_feed_value->GetMutable<FeedInputs>());
   auto size = inputs.size();
@@ -144,6 +145,7 @@ void set_feed_variable(const std::vector<std::vector<T>>& inputs) {
 template <typename T>
 std::vector<std::vector<T>> get_fetch_variable() {
   typedef std::vector<paddle::framework::Tensor> FetchOutputs;
+  // Tensors in fetch value variable will only be in CPUPlace
   Variable* g_fetch_value = GetScope()->FindVar("fetch_value");
   FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable<FetchOutputs>());
 
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 2c416570cf..b6a9d7fbc2 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -66,15 +66,10 @@ void Scope::DropKids() {
 
 std::once_flag feed_variable_flag;
 
-template <typename T, typename... Args>
-std::unique_ptr<T> make_unique(Args&&... args) {
-  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
-
 framework::Scope* GetScope() {
-  static std::unique_ptr<framework::Scope> g_scope =
-      make_unique<framework::Scope>();
+  static std::unique_ptr<framework::Scope> g_scope{nullptr};
   std::call_once(feed_variable_flag, [&]() {
+    g_scope.reset(new framework::Scope());
     g_scope->NewVar("feed_value");
     g_scope->NewVar("fetch_value");
   });
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index d40db3ff2e..f2c498e2e2 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -33,7 +33,7 @@ class FeedOp : public framework::OperatorWithKernel {
 
     auto in_dim = tensors[col].dims();
     ctx->SetOutputDim("Out", in_dim);
-    // TODO(qijun) need to handle LodTensor later
+    // TODO(qijun): need to handle LodTensor later
   }
 
   framework::DataType IndicateDataType(
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index a885deacc8..f6882cbd03 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -39,7 +39,7 @@ class FetchOp : public framework::OperatorWithKernel {
     tmp.Resize(input_dim);
     (*tensors)[col].Resize(input_dim);
 
-    // TODO(qijun) need to handle LodTensor later
+    // TODO(qijun): need to handle LodTensor later
   }
 
   framework::DataType IndicateDataType(

From 1f5192a27b968a7980c2eead7b6885e66f09575a Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Fri, 6 Oct 2017 11:06:59 -0700
Subject: [PATCH 22/61] fix executor gpu unittest

---
 paddle/framework/executor.cc      |  2 +-
 paddle/framework/executor_test.cc | 20 +++++++++++++++-----
 paddle/operators/fetch_op.cu      |  2 +-
 paddle/platform/gpu_info.cc       |  3 ++-
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index ee0df039ac..c18ba049c8 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -30,7 +30,7 @@ Executor::Executor(const std::vector<platform::Place>& places) {
       device_contexts_[i] = new platform::CPUDeviceContext(
           boost::get<platform::CPUPlace>(places[i]));
     } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
       device_contexts_[i] = new platform::CUDADeviceContext(
           boost::get<platform::GPUPlace>(places[i]));
 #else
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 5e327cc893..55e209628b 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -293,7 +293,7 @@ TEST_F(ExecutorTesterFeed, CPU) {
   delete executor;
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST_F(ExecutorTesterRandom, GPU) {
   std::vector<Place> places;
   GPUPlace gpu_place(0);
@@ -315,10 +315,20 @@ TEST_F(ExecutorTesterFeed, GPU) {
 
   Executor* executor = new Executor(places);
 
-  // need to set feed variable before Executor::Run
-  set_feed_variable<float>(inputs_);
-  executor->Run(pdesc_, GetScope());
-
+  // 3 mini-batch
+  for (int i = 0; i < 3; i++) {
+    // need to set feed variable before Executor::Run
+    std::cout << "start mini-batch " << i << std::endl;
+    set_feed_variable<float>(inputs_);
+    executor->Run(pdesc_, GetScope());
+    std::vector<std::vector<float>> result = get_fetch_variable<float>();
+    for (auto& vec : result) {
+      for (auto& num : vec) {
+        std::cout << num << " ";
+      }
+      std::cout << std::endl;
+    }
+  }
   delete executor;
 }
 #endif
diff --git a/paddle/operators/fetch_op.cu b/paddle/operators/fetch_op.cu
index 2e24d3a8ad..ca39d24c79 100644
--- a/paddle/operators/fetch_op.cu
+++ b/paddle/operators/fetch_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/feed_op.h"
+#include "paddle/operators/fetch_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(fetch, ops::FetchKernel<float>);
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index 486dcd623a..aa76bb209d 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -43,7 +43,8 @@ int GetCurrentDeviceId() {
 }
 
 void SetDeviceId(int id) {
-  PADDLE_ENFORCE(id < GetDeviceCount(), "id must less than GPU count");
+  // TODO(qijun): find a better way to cache the cuda device count
+  PADDLE_ENFORCE(id < GetCUDADeviceCount(), "id must less than GPU count");
   PADDLE_ENFORCE(cudaSetDevice(id),
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }

From ac0e3828b49cab7fd943c293516917cfdc4404ee Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Fri, 6 Oct 2017 21:15:29 +0000
Subject: [PATCH 23/61] add Executor::Preprocess to skip ops no fetch depends on

---
 paddle/framework/executor.cc | 102 +++++++++++++++++++++++++++++------
 paddle/framework/executor.h  |  16 ++++++
 2 files changed, 103 insertions(+), 15 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index aafef12554..89b83f82fb 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -13,13 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/executor.h"
+#include <algorithm>
 #include <iostream>
 #include <memory>
+#include <set>
 #include <vector>
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
 
+#include <boost/range/adaptor/reversed.hpp>
+
 namespace paddle {
 namespace framework {
 
@@ -64,26 +68,94 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
     scope->NewVar(var.name());
   }
 
-  for (auto& op_desc : block.ops()) {
-    auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-    std::cout << op->DebugString() << std::endl;
-    op->Run(*scope, *device);
+  std::vector<bool> should_run = Preprocess(pdesc);
+  PADDLE_ENFORCE(should_run.size() == block.ops_size(),
+                 "should_run.size() != block.ops_size()");
+  for (int i = 0; i < should_run.size(); ++i) {
+    if (should_run[i]) {
+      auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i));
+      std::cout << op->DebugString() << std::endl;
+      op->Run(*scope, *device);
+    }
   }
 
-  // TODO(tonyyang-svail): need to test gpu device
-  for (auto& device_context : device_contexts_) {
-    device_context->Wait();
-  }
   // // print tensor value
-  for (auto& var : block.vars()) {
-    std::cout << var.name() << std::endl;
-    auto v = scope->FindVar(var.name());
-    const LoDTensor& t = v->Get<LoDTensor>();
-    for (int i = 0; i < t.numel(); ++i) {
-      std::cout << t.data<float>()[i] << " ";
+  // for (auto& var : block.vars()) {
+  //   std::cout << var.name() << std::endl;
+  //   auto v = scope->FindVar(var.name());
+  //   const LoDTensor& t = v->Get<LoDTensor>();
+  //   for (int i = 0; i < t.numel(); ++i) {
+  //     std::cout << t.data<float>()[i] << " ";
+  //   }
+  //   std::cout << std::endl;
+  // }
+}
+
+std::vector<bool> Executor::Preprocess(const ProgramDesc& pdesc) {
+  // TODO(tonyyang-svail):
+  //    - only runs the first block
+
+  auto& block = pdesc.blocks(0);
+  auto& ops = block.ops();
+
+  bool expect_feed = true;
+  for (auto& op_desc : ops) {
+    PADDLE_ENFORCE(op_desc.type() != "feed" || expect_feed,
+                   "All FeedOps are at the beginning of the ProgramDesc");
+    expect_feed = (op_desc.type() == "feed");
+  }
+
+  bool expect_fetch = true;
+  for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
+    auto& op_desc = *op_iter;
+    PADDLE_ENFORCE(op_desc.type() != "fetch" || expect_fetch,
+                   "All FetchOps must at the end of the ProgramDesc");
+    expect_fetch = (op_desc.type() == "fetch");
+  }
+
+  std::set<std::string> dependent_vars;
+  std::vector<bool> should_run;
+  for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
+    auto& op_desc = *op_iter;
+
+    bool found_dependent_vars = false;
+    for (auto& var : op_desc.outputs()) {
+      for (auto& argu : var.arguments()) {
+        if (dependent_vars.count(argu) != 0) {
+          found_dependent_vars = true;
+        }
+      }
+    }
+
+    // TODO(tonyyang-svail): add VLOG here for debugging
+    if (op_desc.type() == "fetch" || found_dependent_vars) {
+      // erase its output to the dependency graph
+      for (auto& var : op_desc.outputs()) {
+        for (auto& argu : var.arguments()) {
+          dependent_vars.erase(argu);
+        }
+      }
+
+      // insert its input to the dependency graph
+      for (auto& var : op_desc.inputs()) {
+        for (auto& argu : var.arguments()) {
+          dependent_vars.insert(argu);
+        }
+      }
+
+      // this op should be executed
+      should_run.push_back(true);
+    } else {
+      // this op should NOT be executed
+      should_run.push_back(false);
     }
-    std::cout << std::endl;
   }
+
+  // since we are traversing the ProgramDesc in reverse order
+  // we reverse the should_run vector
+  std::reverse(should_run.begin(), should_run.end());
+
+  return should_run;
 }
 
 }  // namespace framework
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index 9e443c8fca..1d2e6c96de 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -26,8 +26,24 @@ class Executor {
  public:
   explicit Executor(const std::vector<platform::Place>& places);
   ~Executor();
+
+  /* @Brief
+   * Runtime evaluation of the given ProgramDesc under certain Scope
+   *
+   * @param
+   *  ProgramDesc
+   *  Scope
+   */
   void Run(const ProgramDesc&, Scope*);
 
+ protected:
+  /* @Brief
+   *
+   * @param
+   *  ProgramDesc
+   */
+  std::vector<bool> Preprocess(const ProgramDesc& pdesc);
+
  private:
   std::vector<platform::DeviceContext*> device_contexts_;
 };

From e8a678e1eecd11fee219a93c6c586ee24663a506 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Fri, 6 Oct 2017 22:46:04 +0000
Subject: [PATCH 24/61] fix executor gpu unittest runtime error

---
 paddle/framework/executor_test.cc | 19 ++++++++++++++++---
 paddle/operators/fetch_op.cc      |  2 --
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 55e209628b..82f9bd6f2d 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -239,6 +239,7 @@ class ExecutorTesterFeed : public ::testing::Test {
   std::vector<std::vector<float>> inputs_;
 };
 
+#ifndef PADDLE_WITH_CUDA
 TEST_F(ExecutorTesterRandom, CPU) {
   std::vector<Place> places;
   CPUPlace cpu_place;
@@ -292,13 +293,19 @@ TEST_F(ExecutorTesterFeed, CPU) {
 
   delete executor;
 }
-
-#ifdef PADDLE_WITH_CUDA
+#else
 TEST_F(ExecutorTesterRandom, GPU) {
   std::vector<Place> places;
   GPUPlace gpu_place(0);
   places.push_back(gpu_place);
 
+  // We have a global Scope and BuddyAllocator, and we must ensure
+  // global BuddyAllocator is initialized before global Scope. Thus,
+  // global Scope will deconstruct before BuddyAllocator. Otherwise,
+  // "pointer being freed was not allocated" error will appear.
+  // If paddle is compiled with GPU, both CPU and GPU BuddyAllocator
+  // need to be used at first.
+  paddle::memory::Used(CPUPlace());
   paddle::memory::Used(gpu_place);
 
   Executor* executor = new Executor(places);
@@ -310,7 +317,13 @@ TEST_F(ExecutorTesterFeed, GPU) {
   std::vector<Place> places;
   GPUPlace gpu_place(0);
   places.push_back(gpu_place);
-
+  // We have a global Scope and BuddyAllocator, and we must ensure
+  // global BuddyAllocator is initialized before global Scope. Thus,
+  // global Scope will deconstruct before BuddyAllocator. Otherwise,
+  // "pointer being freed was not allocated" error will appear.
+  // If paddle is compiled with GPU, both CPU and GPU BuddyAllocator
+  // need to be used at first.
+  paddle::memory::Used(CPUPlace());
   paddle::memory::Used(gpu_place);
 
   Executor* executor = new Executor(places);
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index f6882cbd03..4b6b3ca85a 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -35,8 +35,6 @@ class FetchOp : public framework::OperatorWithKernel {
     }
 
     auto input_dim = ctx->GetInputDim("Input");
-    framework::Tensor tmp;
-    tmp.Resize(input_dim);
     (*tensors)[col].Resize(input_dim);
 
     // TODO(qijun): need to handle LodTensor later

From 91f5d2b9cb23cbb6048180ed791e53659532cf04 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Fri, 6 Oct 2017 16:09:19 -0700
Subject: [PATCH 25/61] follow comments and create local_scope inside executor
 run method

---
 paddle/framework/executor.cc      |  6 ++----
 paddle/framework/executor_test.cc | 12 ++++++------
 paddle/framework/scope.cc         |  2 +-
 paddle/framework/scope.h          |  2 +-
 paddle/operators/feed_op.cc       |  2 +-
 paddle/operators/feed_op.h        |  6 ++----
 paddle/operators/fetch_op.cc      |  2 +-
 paddle/operators/fetch_op.h       |  6 ++----
 8 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index c18ba049c8..7fc407ebc9 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -56,9 +56,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
   auto& block = pdesc.blocks(0);
   auto& device = device_contexts_[0];
 
-  // TODO(tonyyang-svail):
-  //    - runs on a new local scope
-  // Scope& local_scope = scope->NewScope();
+  Scope& local_scope = scope->NewScope();
 
   for (auto& var : block.vars()) {
     scope->NewVar(var.name());
@@ -67,7 +65,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
   for (auto& op_desc : block.ops()) {
     auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
     std::cout << op->DebugString() << std::endl;
-    op->Run(*scope, *device);
+    op->Run(local_scope, *device);
   }
 
   // TODO(tonyyang-svail): need to test gpu device
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 82f9bd6f2d..bf6c1dffc1 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -131,7 +131,7 @@ template <typename T>
 void set_feed_variable(const std::vector<std::vector<T>>& inputs) {
   typedef std::vector<paddle::framework::Tensor> FeedInputs;
   // Tensors in feed value variable will only be in CPUPlace
-  Variable* g_feed_value = GetScope()->FindVar("feed_value");
+  Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value");
   FeedInputs& feed_inputs = *(g_feed_value->GetMutable<FeedInputs>());
   auto size = inputs.size();
   feed_inputs.resize(size);
@@ -146,7 +146,7 @@ template <typename T>
 std::vector<std::vector<T>> get_fetch_variable() {
   typedef std::vector<paddle::framework::Tensor> FetchOutputs;
   // Tensors in fetch value variable will only be in CPUPlace
-  Variable* g_fetch_value = GetScope()->FindVar("fetch_value");
+  Variable* g_fetch_value = GetGlobalScope()->FindVar("fetch_value");
   FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable<FetchOutputs>());
 
   auto size = fetch_outputs.size();
@@ -252,7 +252,7 @@ TEST_F(ExecutorTesterRandom, CPU) {
   paddle::memory::Used(cpu_place);
 
   Executor* executor = new Executor(places);
-  executor->Run(pdesc_, GetScope());
+  executor->Run(pdesc_, GetGlobalScope());
   std::vector<std::vector<float>> result = get_fetch_variable<float>();
   for (auto& vec : result) {
     for (auto& num : vec) {
@@ -281,7 +281,7 @@ TEST_F(ExecutorTesterFeed, CPU) {
     // need to set feed variable before Executor::Run
     std::cout << "start mini-batch " << i << std::endl;
     set_feed_variable<float>(inputs_);
-    executor->Run(pdesc_, GetScope());
+    executor->Run(pdesc_, GetGlobalScope());
     std::vector<std::vector<float>> result = get_fetch_variable<float>();
     for (auto& vec : result) {
       for (auto& num : vec) {
@@ -309,7 +309,7 @@ TEST_F(ExecutorTesterRandom, GPU) {
   paddle::memory::Used(gpu_place);
 
   Executor* executor = new Executor(places);
-  executor->Run(pdesc_, GetScope());
+  executor->Run(pdesc_, GetGlobalScope());
   delete executor;
 }
 
@@ -333,7 +333,7 @@ TEST_F(ExecutorTesterFeed, GPU) {
     // need to set feed variable before Executor::Run
     std::cout << "start mini-batch " << i << std::endl;
     set_feed_variable<float>(inputs_);
-    executor->Run(pdesc_, GetScope());
+    executor->Run(pdesc_, GetGlobalScope());
     std::vector<std::vector<float>> result = get_fetch_variable<float>();
     for (auto& vec : result) {
       for (auto& num : vec) {
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index b6a9d7fbc2..2a0d9bbf33 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -66,7 +66,7 @@ void Scope::DropKids() {
 
 std::once_flag feed_variable_flag;
 
-framework::Scope* GetScope() {
+framework::Scope* GetGlobalScope() {
   static std::unique_ptr<framework::Scope> g_scope{nullptr};
   std::call_once(feed_variable_flag, [&]() {
     g_scope.reset(new framework::Scope());
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 96f3ae875b..319d291efe 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -73,7 +73,7 @@ class Scope {
   DISABLE_COPY_AND_ASSIGN(Scope);
 };
 
-framework::Scope* GetScope();
+framework::Scope* GetGlobalScope();
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index f2c498e2e2..b9e43be966 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -27,7 +27,7 @@ class FeedOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output should be not null.");
     int col = ctx->Attrs().Get<int>("col");
     framework::Variable* g_feed_variable =
-        framework::GetScope()->FindVar("feed_value");
+        framework::GetGlobalScope()->FindVar("feed_value");
 
     const FeedInputs& tensors = g_feed_variable->Get<FeedInputs>();
 
diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h
index cf93b6f434..de8ec6ff61 100644
--- a/paddle/operators/feed_op.h
+++ b/paddle/operators/feed_op.h
@@ -19,17 +19,15 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-
 template <typename T>
 class FeedKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     typedef std::vector<framework::Tensor> FeedInputs;
-    Tensor* out = ctx.Output<Tensor>("Out");
+    framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
     out->mutable_data<T>(ctx.GetPlace());
     framework::Variable* g_feed_variable =
-        framework::GetScope()->FindVar("feed_value");
+        framework::GetGlobalScope()->FindVar("feed_value");
     int col = ctx.template Attr<int>("col");
     const FeedInputs& tensors = g_feed_variable->Get<FeedInputs>();
     out->CopyFrom<T>(tensors[col], ctx.GetPlace());
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index 4b6b3ca85a..7bde4953cd 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -27,7 +27,7 @@ class FetchOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should be not null.");
     int col = ctx->Attrs().Get<int>("col");
     framework::Variable* g_fetch_variable =
-        framework::GetScope()->FindVar("fetch_value");
+        framework::GetGlobalScope()->FindVar("fetch_value");
 
     FetchOutputs* tensors = g_fetch_variable->GetMutable<FetchOutputs>();
     if (tensors->size() < static_cast<size_t>(col + 1)) {
diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h
index e8d5e3a9c0..3bec9c9974 100644
--- a/paddle/operators/fetch_op.h
+++ b/paddle/operators/fetch_op.h
@@ -19,17 +19,15 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-
 template <typename T>
 class FetchKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     typedef std::vector<framework::Tensor> FetchOutputs;
-    const Tensor* input = ctx.Input<Tensor>("Input");
+    const framework::Tensor* input = ctx.Input<framework::Tensor>("Input");
     int col = ctx.template Attr<int>("col");
     framework::Variable* g_fetch_variable =
-        framework::GetScope()->FindVar("fetch_value");
+        framework::GetGlobalScope()->FindVar("fetch_value");
     FetchOutputs* tensors = g_fetch_variable->GetMutable<FetchOutputs>();
     (*tensors)[col].mutable_data<T>(platform::CPUPlace());
     (*tensors)[col].CopyFrom<T>(*input, platform::CPUPlace());

From a7d700e0ba35e78cfbe85acf2d0b4cb72d22b10f Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Fri, 6 Oct 2017 16:30:44 -0700
Subject: [PATCH 26/61] revert local scope to TODO

---
 paddle/framework/executor.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 7fc407ebc9..c18ba049c8 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -56,7 +56,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
   auto& block = pdesc.blocks(0);
   auto& device = device_contexts_[0];
 
-  Scope& local_scope = scope->NewScope();
+  // TODO(tonyyang-svail):
+  //    - runs on a new local scope
+  // Scope& local_scope = scope->NewScope();
 
   for (auto& var : block.vars()) {
     scope->NewVar(var.name());
@@ -65,7 +67,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
   for (auto& op_desc : block.ops()) {
     auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
     std::cout << op->DebugString() << std::endl;
-    op->Run(local_scope, *device);
+    op->Run(*scope, *device);
   }
 
   // TODO(tonyyang-svail): need to test gpu device

From b68a95f7f488f8ff94f4793ec294121aa004d02d Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Sat, 7 Oct 2017 00:47:19 +0000
Subject: [PATCH 27/61] prune pass simple test

---
 paddle/framework/executor.cc      | 2 ++
 paddle/framework/executor.h       | 4 ++++
 paddle/framework/executor_test.cc | 1 +
 3 files changed, 7 insertions(+)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 10d22ba01c..4f217277d0 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -145,9 +145,11 @@ std::vector<bool> Executor::Preprocess(const ProgramDesc& pdesc) {
 
       // this op should be executed
       should_run.push_back(true);
+      LOG(INFO) << "Yes " << op_desc.type();
     } else {
       // this op should NOT be executed
       should_run.push_back(false);
+      LOG(INFO) << "No " << op_desc.type();
     }
   }
 
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index 1d2e6c96de..75cb5939ff 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -38,9 +38,13 @@ class Executor {
 
  protected:
   /* @Brief
+   * Pruning the graph
    *
    * @param
    *  ProgramDesc
+   *
+   * @return
+   *  vector<bool> Same size as ops. Indicates whether an op should be run.
    */
   std::vector<bool> Preprocess(const ProgramDesc& pdesc);
 
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index bf6c1dffc1..6a4b2e3d1a 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -226,6 +226,7 @@ class ExecutorTesterFeed : public ::testing::Test {
     Out->set_parameter("Out");
     Out->add_arguments("c");
 
+    add_fetch_op("a", dim, 0, root_block);
     add_fetch_op("c", dim, 0, root_block);
 
     std::vector<float> vec1 = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};

From 005f15b4957fcce594e1a3b8a27be1c1723ab0fc Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Sat, 7 Oct 2017 21:46:00 +0000
Subject: [PATCH 28/61] FeedOp and FetchOp unit test

---
 paddle/framework/executor.cc      |  6 ++--
 paddle/framework/executor_test.cc | 56 +++++++++++--------------------
 2 files changed, 22 insertions(+), 40 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 4f217277d0..9391e18ded 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -69,12 +69,10 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
   }
 
   std::vector<bool> should_run = Preprocess(pdesc);
-  PADDLE_ENFORCE(should_run.size() == block.ops_size(),
-                 "should_run.size() != block.ops_size()");
-  for (int i = 0; i < should_run.size(); ++i) {
+  PADDLE_ENFORCE(should_run.size() == block.ops_size());
+  for (size_t i = 0; i < should_run.size(); ++i) {
     if (should_run[i]) {
       auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i));
-      std::cout << op->DebugString() << std::endl;
       op->Run(*scope, *device);
     }
   }
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 6a4b2e3d1a..b198fa143c 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -127,10 +127,11 @@ void add_fetch_op(string var_name, std::vector<int>& dim, int index,
 
 std::once_flag set_variable_flag;
 
+// Tensors in feed value variable will only be in CPUPlace
+// So we can  memcpy the data from vector<T> to feed_value
 template <typename T>
 void set_feed_variable(const std::vector<std::vector<T>>& inputs) {
   typedef std::vector<paddle::framework::Tensor> FeedInputs;
-  // Tensors in feed value variable will only be in CPUPlace
   Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value");
   FeedInputs& feed_inputs = *(g_feed_value->GetMutable<FeedInputs>());
   auto size = inputs.size();
@@ -142,10 +143,11 @@ void set_feed_variable(const std::vector<std::vector<T>>& inputs) {
   }
 }
 
+// Tensors in fetch value variable will only be in CPUPlace
+// So we can memcpy the data from fetch_value to vector<T>
 template <typename T>
 std::vector<std::vector<T>> get_fetch_variable() {
   typedef std::vector<paddle::framework::Tensor> FetchOutputs;
-  // Tensors in fetch value variable will only be in CPUPlace
   Variable* g_fetch_value = GetGlobalScope()->FindVar("fetch_value");
   FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable<FetchOutputs>());
 
@@ -159,6 +161,7 @@ std::vector<std::vector<T>> get_fetch_variable() {
            fetch_outputs[i].numel() * sizeof(T));
     result.push_back(tmp);
   }
+
   return result;
 }
 
@@ -197,7 +200,7 @@ class ExecutorTesterRandom : public ::testing::Test {
   ProgramDesc pdesc_;
 };
 
-class ExecutorTesterFeed : public ::testing::Test {
+class ExecutorTesterFeedAndFetch : public ::testing::Test {
  public:
   virtual void SetUp() override {
     auto root_block = pdesc_.add_blocks();
@@ -208,26 +211,8 @@ class ExecutorTesterFeed : public ::testing::Test {
 
     add_feed_op("a", dim, 0, root_block);
     add_feed_op("b", dim, 1, root_block);
-
-    auto c = root_block->add_vars();
-    c->set_name("c");
-    auto c_lt = c->mutable_lod_tensor();
-    c_lt->set_data_type(paddle::framework::DataType::FP32);
-
-    auto op = root_block->add_ops();
-    op->set_type("elementwise_add");
-    auto X = op->add_inputs();
-    X->set_parameter("X");
-    X->add_arguments("a");
-    auto Y = op->add_inputs();
-    Y->set_parameter("Y");
-    Y->add_arguments("b");
-    auto Out = op->add_outputs();
-    Out->set_parameter("Out");
-    Out->add_arguments("c");
-
     add_fetch_op("a", dim, 0, root_block);
-    add_fetch_op("c", dim, 0, root_block);
+    add_fetch_op("b", dim, 1, root_block);
 
     std::vector<float> vec1 = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
     std::vector<float> vec2 = {4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
@@ -255,6 +240,7 @@ TEST_F(ExecutorTesterRandom, CPU) {
   Executor* executor = new Executor(places);
   executor->Run(pdesc_, GetGlobalScope());
   std::vector<std::vector<float>> result = get_fetch_variable<float>();
+
   for (auto& vec : result) {
     for (auto& num : vec) {
       std::cout << num << " ";
@@ -264,7 +250,7 @@ TEST_F(ExecutorTesterRandom, CPU) {
   delete executor;
 }
 
-TEST_F(ExecutorTesterFeed, CPU) {
+TEST_F(ExecutorTesterFeedAndFetch, CPU) {
   std::vector<Place> places;
   CPUPlace cpu_place;
   places.push_back(cpu_place);
@@ -279,16 +265,15 @@ TEST_F(ExecutorTesterFeed, CPU) {
 
   // 3 mini-batch
   for (int i = 0; i < 3; i++) {
-    // need to set feed variable before Executor::Run
-    std::cout << "start mini-batch " << i << std::endl;
     set_feed_variable<float>(inputs_);
     executor->Run(pdesc_, GetGlobalScope());
     std::vector<std::vector<float>> result = get_fetch_variable<float>();
-    for (auto& vec : result) {
-      for (auto& num : vec) {
-        std::cout << num << " ";
+    PADDLE_ENFORCE_EQ(result.size(), inputs_.size());
+    for (size_t i = 0; i < result.size(); ++i) {
+      PADDLE_ENFORCE_EQ(result[i].size(), inputs_[i].size());
+      for (size_t j = 0; j < result[i].size(); ++j) {
+        PADDLE_ENFORCE_EQ(result[i][j], inputs_[i][j]);
       }
-      std::cout << std::endl;
     }
   }
 
@@ -314,7 +299,7 @@ TEST_F(ExecutorTesterRandom, GPU) {
   delete executor;
 }
 
-TEST_F(ExecutorTesterFeed, GPU) {
+TEST_F(ExecutorTesterFeedAndFetch, GPU) {
   std::vector<Place> places;
   GPUPlace gpu_place(0);
   places.push_back(gpu_place);
@@ -331,16 +316,15 @@ TEST_F(ExecutorTesterFeed, GPU) {
 
   // 3 mini-batch
   for (int i = 0; i < 3; i++) {
-    // need to set feed variable before Executor::Run
-    std::cout << "start mini-batch " << i << std::endl;
     set_feed_variable<float>(inputs_);
     executor->Run(pdesc_, GetGlobalScope());
     std::vector<std::vector<float>> result = get_fetch_variable<float>();
-    for (auto& vec : result) {
-      for (auto& num : vec) {
-        std::cout << num << " ";
+    PADDLE_ENFORCE_EQ(result.size(), inputs_.size());
+    for (size_t i = 0; i < result.size(); ++i) {
+      PADDLE_ENFORCE_EQ(result[i].size(), inputs_[i].size());
+      for (size_t j = 0; j < result[i].size(); ++j) {
+        PADDLE_ENFORCE_EQ(result[i][j], inputs_[i][j]);
       }
-      std::cout << std::endl;
     }
   }
   delete executor;

From a67e8ea3eb8475a17f6285e5cfbe1bf231e0bd28 Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Sun, 8 Oct 2017 04:49:10 +0000
Subject: [PATCH 29/61] Add AddOp

---
 paddle/framework/executor_test.cc | 147 +++++++++++++++++++++++++-----
 1 file changed, 125 insertions(+), 22 deletions(-)

diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index b198fa143c..cf1752f6d8 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/block_desc.h"
 #include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/op_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 
@@ -24,6 +26,7 @@ USE_OP(elementwise_add);
 USE_OP(gaussian_random);
 USE_OP(feed);
 USE_OP(fetch);
+USE_OP(mul);
 
 using std::string;
 using namespace paddle::platform;
@@ -32,7 +35,71 @@ using namespace paddle::framework;
 typedef paddle::framework::BlockDesc proto_block;
 typedef paddle::framework::OpDesc proto_op;
 
-void add_gaussian_random_op(string var_name, std::vector<int>& dim,
+struct SetAttrDescVisitor : public boost::static_visitor<void> {
+  explicit SetAttrDescVisitor(OpDesc::Attr* attr) : attr_(attr) {}
+  mutable OpDesc::Attr* attr_;
+  void operator()(int v) const { attr_->set_i(v); }
+  void operator()(float v) const { attr_->set_f(v); }
+  void operator()(const std::string& v) const { attr_->set_s(v); }
+  void operator()(bool b) const { attr_->set_b(b); }
+
+  void operator()(const std::vector<int>& v) const {
+    VectorToRepeated(v, attr_->mutable_ints());
+  }
+  void operator()(const std::vector<float>& v) const {
+    VectorToRepeated(v, attr_->mutable_floats());
+  }
+  void operator()(const std::vector<std::string>& v) const {
+    VectorToRepeated(v, attr_->mutable_strings());
+  }
+  void operator()(const std::vector<bool>& v) const {
+    VectorToRepeated(v, attr_->mutable_bools());
+  }
+  void operator()(BlockDesc* desc) const { attr_->set_block_idx(desc->idx()); }
+  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+};
+
+void AddOp(const std::string& type, const VariableNameMap& inputs,
+           const VariableNameMap& outputs, AttributeMap attrs,
+           proto_block* block) {
+  // insert output
+  for (auto kv : outputs) {
+    for (auto v : kv.second) {
+      auto var = block->add_vars();
+      var->set_name(v);
+      auto var_lt = var->mutable_lod_tensor();
+      var_lt->set_data_type(paddle::framework::DataType::FP32);
+    }
+  }
+
+  // insert op
+  auto op = block->add_ops();
+  op->set_type(type);
+  for (auto kv : inputs) {
+    auto X = op->add_inputs();
+    X->set_parameter(kv.first);
+    for (auto argu : kv.second) {
+      X->add_arguments(argu);
+    }
+  }
+  for (auto kv : outputs) {
+    auto X = op->add_outputs();
+    X->set_parameter(kv.first);
+    for (auto argu : kv.second) {
+      X->add_arguments(argu);
+    }
+  }
+  for (auto& attr : attrs) {
+    auto* attr_desc = op->add_attrs();
+    attr_desc->set_name(attr.first);
+    attr_desc->set_type(
+        static_cast<paddle::framework::AttrType>(attr.second.which() - 1));
+    SetAttrDescVisitor visitor(attr_desc);
+    boost::apply_visitor(visitor, attr.second);
+  }
+}
+
+void add_gaussian_random_op(string var_name, std::vector<int> dim,
                             proto_block* block) {
   // insert variable
   auto a = block->add_vars();
@@ -91,7 +158,7 @@ void add_feed_op(string var_name, std::vector<int>& dim, int index,
   Out->add_arguments(var_name);
 }
 
-void add_fetch_op(string var_name, std::vector<int>& dim, int index,
+void add_fetch_op(string var_name, std::vector<int> dim, int index,
                   proto_block* block) {
   // insert variable
   auto a = block->add_vars();
@@ -125,6 +192,28 @@ void add_fetch_op(string var_name, std::vector<int>& dim, int index,
   Out->add_arguments(var_name);
 }
 
+void add_mul_op(string X_str, string Y_str, string Out_str,
+                proto_block* block) {
+  // insert variable
+  auto a = block->add_vars();
+  a->set_name(Out_str);
+  auto a_lt = a->mutable_lod_tensor();
+  a_lt->set_data_type(paddle::framework::DataType::FP32);
+
+  // insert op
+  auto op = block->add_ops();
+  op->set_type("mul");
+  auto X = op->add_inputs();
+  X->set_parameter("X");
+  X->add_arguments(X_str);
+  auto Y = op->add_inputs();
+  Y->set_parameter("Y");
+  Y->add_arguments(Y_str);
+  auto Out = op->add_outputs();
+  Out->set_parameter("Out");
+  Out->add_arguments(Out_str);
+}
+
 std::once_flag set_variable_flag;
 
 // Tensors in feed value variable will only be in CPUPlace
@@ -168,36 +257,37 @@ std::vector<std::vector<T>> get_fetch_variable() {
 class ExecutorTesterRandom : public ::testing::Test {
  public:
   virtual void SetUp() override {
+    int input_dim = 5, batch_size = 2, embed_dim = 5;
+
+    // init pdesc
+    auto init_root_block = init_pdesc_.add_blocks();
+    init_root_block->set_idx(0);
+    init_root_block->set_parent_idx(-1);
+    AddOp("gaussian_random", {}, {{"Out", {"w1"}}},
+          {{"dims", std::vector<int>{input_dim, embed_dim}}}, init_root_block);
+    AddOp("gaussian_random", {}, {{"Out", {"w2"}}},
+          {{"dims", std::vector<int>{embed_dim, input_dim}}}, init_root_block);
+    AddOp("fetch", {{"Input", {"w1"}}}, {},
+          {{"dims", std::vector<int>{input_dim, embed_dim}}}, init_root_block);
+    AddOp("fetch", {{"Input", {"w2"}}}, {},
+          {{"dims", std::vector<int>{embed_dim, input_dim}}}, init_root_block);
+
+    // run pdesc
     auto root_block = pdesc_.add_blocks();
     root_block->set_idx(0);
     root_block->set_parent_idx(-1);
 
-    std::vector<int> dim{2, 3};
-    add_gaussian_random_op("a", dim, root_block);
-    add_gaussian_random_op("b", dim, root_block);
+    add_gaussian_random_op("a", {batch_size, input_dim}, root_block);
 
-    auto c = root_block->add_vars();
-    c->set_name("c");
-    auto c_lt = c->mutable_lod_tensor();
-    c_lt->set_data_type(paddle::framework::DataType::FP32);
+    add_mul_op("a", "w1", "b", root_block);
+    add_mul_op("b", "w2", "a_out", root_block);
 
-    auto op = root_block->add_ops();
-    op->set_type("elementwise_add");
-    auto X = op->add_inputs();
-    X->set_parameter("X");
-    X->add_arguments("a");
-    auto Y = op->add_inputs();
-    Y->set_parameter("Y");
-    Y->add_arguments("b");
-    auto Out = op->add_outputs();
-    Out->set_parameter("Out");
-    Out->add_arguments("c");
-
-    add_fetch_op("c", dim, 0, root_block);
+    add_fetch_op("a_out", {input_dim, batch_size}, 0, root_block);
   }
 
  protected:
   ProgramDesc pdesc_;
+  ProgramDesc init_pdesc_;
 };
 
 class ExecutorTesterFeedAndFetch : public ::testing::Test {
@@ -238,6 +328,7 @@ TEST_F(ExecutorTesterRandom, CPU) {
   paddle::memory::Used(cpu_place);
 
   Executor* executor = new Executor(places);
+  executor->Run(init_pdesc_, GetGlobalScope());
   executor->Run(pdesc_, GetGlobalScope());
   std::vector<std::vector<float>> result = get_fetch_variable<float>();
 
@@ -295,7 +386,19 @@ TEST_F(ExecutorTesterRandom, GPU) {
   paddle::memory::Used(gpu_place);
 
   Executor* executor = new Executor(places);
+
+  LOG(INFO) << "Run Init";
+  executor->Run(init_pdesc_, GetGlobalScope());
+  LOG(INFO) << "Run";
   executor->Run(pdesc_, GetGlobalScope());
+  std::vector<std::vector<float>> result = get_fetch_variable<float>();
+
+  for (auto& vec : result) {
+    for (auto& num : vec) {
+      std::cout << num << " ";
+    }
+    std::cout << std::endl;
+  }
   delete executor;
 }
 

From c83ea1cdca1b751b93a1c63ea8fa58706131951b Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Sun, 8 Oct 2017 05:11:40 +0000
Subject: [PATCH 30/61] remove hardcode add_XX_op

---
 paddle/framework/executor_test.cc | 147 +++++-------------------------
 1 file changed, 21 insertions(+), 126 deletions(-)

diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index cf1752f6d8..e8ea09b77d 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -99,121 +99,6 @@ void AddOp(const std::string& type, const VariableNameMap& inputs,
   }
 }
 
-void add_gaussian_random_op(string var_name, std::vector<int> dim,
-                            proto_block* block) {
-  // insert variable
-  auto a = block->add_vars();
-  a->set_name(var_name);
-  auto a_lt = a->mutable_lod_tensor();
-  a_lt->set_data_type(paddle::framework::DataType::FP32);
-  for (int i : dim) {
-    a_lt->add_dims(i);
-  }
-
-  // insert operation
-  auto op = block->add_ops();
-  op->set_type("gaussian_random");
-  auto dims = op->add_attrs();
-  dims->set_name("dims");
-  dims->set_type(paddle::framework::AttrType::INTS);
-  for (int i : dim) {
-    dims->add_ints(i);
-  }
-  auto Out = op->add_outputs();
-  Out->set_parameter("Out");
-  Out->add_arguments(var_name);
-}
-
-void add_feed_op(string var_name, std::vector<int>& dim, int index,
-                 proto_block* block) {
-  // insert variable
-  auto a = block->add_vars();
-  a->set_name(var_name);
-  auto a_lt = a->mutable_lod_tensor();
-  a_lt->set_data_type(paddle::framework::DataType::FP32);
-  for (int i : dim) {
-    a_lt->add_dims(i);
-  }
-
-  // insert operation
-  auto op = block->add_ops();
-  op->set_type("feed");
-
-  // set dims attr
-  auto dims = op->add_attrs();
-  dims->set_name("dims");
-  dims->set_type(paddle::framework::AttrType::INTS);
-  for (int i : dim) {
-    dims->add_ints(i);
-  }
-
-  // set col attr
-  auto col = op->add_attrs();
-  col->set_name("col");
-  col->set_type(paddle::framework::AttrType::INT);
-  col->set_i(index);
-
-  auto Out = op->add_outputs();
-  Out->set_parameter("Out");
-  Out->add_arguments(var_name);
-}
-
-void add_fetch_op(string var_name, std::vector<int> dim, int index,
-                  proto_block* block) {
-  // insert variable
-  auto a = block->add_vars();
-  a->set_name(var_name);
-  auto a_lt = a->mutable_lod_tensor();
-  a_lt->set_data_type(paddle::framework::DataType::FP32);
-  for (int i : dim) {
-    a_lt->add_dims(i);
-  }
-
-  // insert operation
-  auto op = block->add_ops();
-  op->set_type("fetch");
-
-  // set dims attr
-  auto dims = op->add_attrs();
-  dims->set_name("dims");
-  dims->set_type(paddle::framework::AttrType::INTS);
-  for (int i : dim) {
-    dims->add_ints(i);
-  }
-
-  // set col attr
-  auto col = op->add_attrs();
-  col->set_name("col");
-  col->set_type(paddle::framework::AttrType::INT);
-  col->set_i(index);
-
-  auto Out = op->add_inputs();
-  Out->set_parameter("Input");
-  Out->add_arguments(var_name);
-}
-
-void add_mul_op(string X_str, string Y_str, string Out_str,
-                proto_block* block) {
-  // insert variable
-  auto a = block->add_vars();
-  a->set_name(Out_str);
-  auto a_lt = a->mutable_lod_tensor();
-  a_lt->set_data_type(paddle::framework::DataType::FP32);
-
-  // insert op
-  auto op = block->add_ops();
-  op->set_type("mul");
-  auto X = op->add_inputs();
-  X->set_parameter("X");
-  X->add_arguments(X_str);
-  auto Y = op->add_inputs();
-  Y->set_parameter("Y");
-  Y->add_arguments(Y_str);
-  auto Out = op->add_outputs();
-  Out->set_parameter("Out");
-  Out->add_arguments(Out_str);
-}
-
 std::once_flag set_variable_flag;
 
 // Tensors in feed value variable will only be in CPUPlace
@@ -268,21 +153,27 @@ class ExecutorTesterRandom : public ::testing::Test {
     AddOp("gaussian_random", {}, {{"Out", {"w2"}}},
           {{"dims", std::vector<int>{embed_dim, input_dim}}}, init_root_block);
     AddOp("fetch", {{"Input", {"w1"}}}, {},
-          {{"dims", std::vector<int>{input_dim, embed_dim}}}, init_root_block);
+          {{"dims", std::vector<int>{input_dim, embed_dim}}, {"col", 0}},
+          init_root_block);
     AddOp("fetch", {{"Input", {"w2"}}}, {},
-          {{"dims", std::vector<int>{embed_dim, input_dim}}}, init_root_block);
+          {{"dims", std::vector<int>{embed_dim, input_dim}}, {"col", 1}},
+          init_root_block);
 
     // run pdesc
     auto root_block = pdesc_.add_blocks();
     root_block->set_idx(0);
     root_block->set_parent_idx(-1);
 
-    add_gaussian_random_op("a", {batch_size, input_dim}, root_block);
-
-    add_mul_op("a", "w1", "b", root_block);
-    add_mul_op("b", "w2", "a_out", root_block);
+    AddOp("gaussian_random", {}, {{"Out", {"a"}}},
+          {{"dims", std::vector<int>{batch_size, input_dim}}}, root_block);
+    AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {},
+          root_block);
+    AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {},
+          root_block);
 
-    add_fetch_op("a_out", {input_dim, batch_size}, 0, root_block);
+    AddOp("fetch", {{"Input", {"a_out"}}}, {},
+          {{"dims", std::vector<int>{input_dim, batch_size}}, {"col", 1}},
+          root_block);
   }
 
  protected:
@@ -299,10 +190,14 @@ class ExecutorTesterFeedAndFetch : public ::testing::Test {
 
     std::vector<int> dim{6};
 
-    add_feed_op("a", dim, 0, root_block);
-    add_feed_op("b", dim, 1, root_block);
-    add_fetch_op("a", dim, 0, root_block);
-    add_fetch_op("b", dim, 1, root_block);
+    AddOp("feed", {}, {{"Out", {"a"}}}, {{"dims", dim}, {"col", 0}},
+          root_block);
+    AddOp("feed", {}, {{"Out", {"b"}}}, {{"dims", dim}, {"col", 1}},
+          root_block);
+    AddOp("fetch", {{"Input", {"a"}}}, {}, {{"dims", dim}, {"col", 0}},
+          root_block);
+    AddOp("fetch", {{"Input", {"b"}}}, {}, {{"dims", dim}, {"col", 1}},
+          root_block);
 
     std::vector<float> vec1 = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
     std::vector<float> vec2 = {4.0, 5.0, 6.0, 7.0, 8.0, 9.0};

From 6e7666f199ab1849e37c4f2e1e2570316dcf5c04 Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Sun, 8 Oct 2017 05:36:19 +0000
Subject: [PATCH 31/61] before backward

---
 paddle/framework/CMakeLists.txt   |  2 +-
 paddle/framework/executor_test.cc | 14 +++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index d8812d7743..7dc9d5c804 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -44,7 +44,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto ${GLOB_OP_LIB})
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward ${GLOB_OP_LIB})
 if(WITH_GPU)
     nv_test(executor_test SRCS executor_test.cc DEPS executor)
 else()
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index e8ea09b77d..7ce472ed2f 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/backward.h"
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/grad_op_builder.h"
 #include "paddle/framework/op_desc.h"
@@ -27,6 +28,7 @@ USE_OP(gaussian_random);
 USE_OP(feed);
 USE_OP(fetch);
 USE_OP(mul);
+USE_OP(squared_l2_distance);
 
 using std::string;
 using namespace paddle::platform;
@@ -170,10 +172,16 @@ class ExecutorTesterRandom : public ::testing::Test {
           root_block);
     AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {},
           root_block);
-
-    AddOp("fetch", {{"Input", {"a_out"}}}, {},
-          {{"dims", std::vector<int>{input_dim, batch_size}}, {"col", 1}},
+    AddOp("squared_l2_distance", {{"X", {"a"}}, {"Y", {"a_out"}}},
+          {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {},
           root_block);
+
+    AppendBackward(pdesc_, {});
+    // AddOp("fetch", {{"Input", {"sub_result"}}}, {},
+    //       {{"dims", std::vector<int>{input_dim, batch_size}}, {"col", 0}},
+    //       root_block);
+    AddOp("fetch", {{"Input", {"l2_distance"}}}, {},
+          {{"dims", std::vector<int>{batch_size}}, {"col", 1}}, root_block);
   }
 
  protected:

From ba791f7b3f0b4f2b43f4391f7ccc10cdf7b0d06c Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Wed, 27 Sep 2017 23:16:35 +0800
Subject: [PATCH 32/61] Add vol2col functor and unit test

---
 paddle/operators/math/CMakeLists.txt  |   7 +-
 paddle/operators/math/vol2col.cc      | 155 +++++++++++++++++++
 paddle/operators/math/vol2col.cu      | 204 ++++++++++++++++++++++++++
 paddle/operators/math/vol2col.h       |  78 ++++++++++
 paddle/operators/math/vol2col_test.cc | 156 ++++++++++++++++++++
 5 files changed, 597 insertions(+), 3 deletions(-)
 create mode 100644 paddle/operators/math/vol2col.cc
 create mode 100644 paddle/operators/math/vol2col.cu
 create mode 100644 paddle/operators/math/vol2col.h
 create mode 100644 paddle/operators/math/vol2col_test.cc

diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 91ae3d49f1..176d357f2e 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -1,16 +1,17 @@
 if(WITH_GPU)
     nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc
-      im2col.cu DEPS cblas device_context operator)
+      im2col.cu vol2col.cc vol2col.cu DEPS cblas device_context operator)
     nv_library(softmax_function SRCS softmax.cc softmax.cu
       DEPS operator)
     nv_library(cross_entropy_function SRCS cross_entropy.cc cross_entropy.cu
       DEPS operator)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc
-      DEPS cblas device_context operator)
+    cc_library(math_function SRCS math_function.cc im2col.cc vol2col.cc
+            DEPS cblas device_context operator)
     cc_library(softmax_function SRCS softmax.cc DEPS operator)
     cc_library(cross_entropy_function SRCS cross_entropy.cc DEPS operator)
 endif()
 
 nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
 cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
+cc_test(vol2col_test SRCS vol2col_test.cc DEPS math_function tensor)
diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc
new file mode 100644
index 0000000000..5bad2e8073
--- /dev/null
+++ b/paddle/operators/math/vol2col.cc
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/vol2col.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * vol = [input_channels, input_depth, input_height, input_width]
+ * col =
+ *   [input_channels, filter_depth, filter_height, filter_width,
+ *                    output_depth, output_height, output_width]
+ */
+template <class T>
+class Vol2ColFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& vol, framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const {
+    PADDLE_ENFORCE(vol.dims().size() == 4);
+    PADDLE_ENFORCE(col.dims().size() == 7);
+
+    int input_channels = vol.dims()[0];
+    int input_depth = vol.dims()[1];
+    int input_height = vol.dims()[2];
+    int input_width = vol.dims()[3];
+    int filter_depth = col.dims()[1];
+    int filter_height = col.dims()[2];
+    int filter_width = col.dims()[3];
+    int output_depth = col.dims()[4];
+    int output_height = col.dims()[5];
+    int output_width = col.dims()[6];
+    int channels_col =
+        input_channels * filter_depth * filter_height * filter_width;
+
+    const T* vol_data = vol.data<T>();
+    T* col_data = col.data<T>();
+
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;
+      int h_offset = (c / filter_width) % filter_height;
+      int d_offset = (c / filter_width / filter_height) % filter_depth;
+      int c_in = c / filter_width / filter_height / filter_depth;
+      for (int d = 0; d < output_depth; ++d) {
+        int d_pad = d * stride_depth - padding_depth + d_offset;
+        for (int h = 0; h < output_height; ++h) {
+          int h_pad = h * stride_height - padding_height + h_offset;
+          for (int w = 0; w < output_width; ++w) {
+            int w_pad = w * stride_width - padding_width + w_offset;
+
+            int col_idx =
+                ((c * output_depth + d) * output_height + h) * output_width + w;
+            if (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
+                w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) {
+              col_data[col_idx] = T(0);
+            } else {
+              int vol_idx =
+                  ((c_in * input_depth + d_pad) * input_height + h_pad) *
+                      input_width +
+                  w_pad;
+              col_data[col_idx] = vol_data[vol_idx];
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * vol = [input_channels, input_depth, input_height, input_width]
+ * col =
+ *   [input_channels, filter_depth, filter_height, filter_width,
+ *                    output_depth, output_height, output_width]
+ */
+template <class T>
+class Col2VolFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& vol, const framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const {
+    PADDLE_ENFORCE(vol.dims().size() == 4);
+    PADDLE_ENFORCE(col.dims().size() == 7);
+
+    int input_channels = vol.dims()[0];
+    int input_depth = vol.dims()[1];
+    int input_height = vol.dims()[2];
+    int input_width = vol.dims()[3];
+    int filter_depth = col.dims()[1];
+    int filter_height = col.dims()[2];
+    int filter_width = col.dims()[3];
+    int output_depth = col.dims()[4];
+    int output_height = col.dims()[5];
+    int output_width = col.dims()[6];
+    int channels_col =
+        input_channels * filter_depth * filter_height * filter_width;
+
+    T* vol_data = vol.data<T>();
+    const T* col_data = col.data<T>();
+
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;
+      int h_offset = (c / filter_width) % filter_height;
+      int d_offset = (c / filter_width / filter_height) % filter_depth;
+      int cIm = c / filter_width / filter_height / filter_depth;
+      for (int d = 0; d < output_depth; ++d) {
+        int d_pad = d * stride_depth - padding_depth + d_offset;
+        for (int h = 0; h < output_height; ++h) {
+          int h_pad = h * stride_height - padding_height + h_offset;
+          for (int w = 0; w < output_width; ++w) {
+            int w_pad = w * stride_width - padding_width + w_offset;
+
+            if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
+                w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
+              int vol_idx =
+                  ((cIm * input_depth + d_pad) * input_height + h_pad) *
+                      input_width +
+                  w_pad;
+              int col_idx =
+                  ((c * output_depth + d) * output_height + h) * output_width +
+                  w;
+              vol_data[vol_idx] += col_data[col_idx];
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Vol2ColFunctor<platform::CPUPlace, float>;
+template class Vol2ColFunctor<platform::CPUPlace, double>;
+template class Col2VolFunctor<platform::CPUPlace, float>;
+template class Col2VolFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col.cu b/paddle/operators/math/vol2col.cu
new file mode 100644
index 0000000000..27b11fb237
--- /dev/null
+++ b/paddle/operators/math/vol2col.cu
@@ -0,0 +1,204 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/vol2col.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <class T>
+__global__ void vol2col(int num_kernels, const T* data_vol, int depth,
+                        int height, int width, int filter_depth,
+                        int filter_height, int filter_width, int stride_depth,
+                        int stride_height, int stride_width, int padding_depth,
+                        int padding_height, int padding_width, int output_detph,
+                        int output_height, int output_width, T* data_col) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    int w_out = index % output_width;
+    int h_out = (index / output_width) % output_height;
+    int d_out = (index / output_width / output_height) % output_detph;
+    int channel_in = index / output_width / output_height / output_detph;
+    int channel_out = channel_in * filter_depth * filter_height * filter_width;
+    int w_in = w_out * stride_width - padding_width;
+    int h_in = h_out * stride_height - padding_height;
+    int d_in = d_out * stride_depth - padding_depth;
+
+    data_col += ((channel_out * output_detph + d_out) * output_height + h_out) *
+                    output_width +
+                w_out;
+    data_vol += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
+    for (int k = 0; k < filter_depth; ++k) {
+      for (int i = 0; i < filter_height; ++i) {
+        for (int j = 0; j < filter_width; ++j) {
+          int d = d_in + k;
+          int h = h_in + i;
+          int w = w_in + j;
+          *data_col = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
+                       w < width)
+                          ? data_vol[(k * height + i) * width + j]
+                          : 0;
+          data_col += output_detph * output_height * output_width;
+        }
+      }
+    }
+  }
+}
+
+/*
+ * vol = [input_channels, input_depth, input_height, input_width]
+ * col =
+ *   [input_channels, filter_depth, filter_height, filter_width,
+ *                    output_depth, output_height, output_width]
+ */
+template <class T>
+class Vol2ColFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& vol, framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const {
+    PADDLE_ENFORCE(vol.dims().size() == 4);
+    PADDLE_ENFORCE(col.dims().size() == 7);
+
+    int input_channels = vol.dims()[0];
+    int input_depth = vol.dims()[1];
+    int input_height = vol.dims()[2];
+    int input_width = vol.dims()[3];
+    int filter_depth = col.dims()[1];
+    int filter_height = col.dims()[2];
+    int filter_width = col.dims()[3];
+    int output_depth = col.dims()[4];
+    int output_height = col.dims()[5];
+    int output_width = col.dims()[6];
+
+    int num_outputs =
+        input_channels * output_depth * output_height * output_width;
+
+    const int threads = 1024;
+    const int blocks = (num_outputs + 1024 - 1) / 1024;
+    vol2col<T><<<blocks, threads, 0,
+                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                     .stream()>>>(
+        num_outputs, vol.data<T>(), input_depth, input_height, input_width,
+        filter_depth, filter_height, filter_width, stride_depth, stride_height,
+        stride_width, padding_depth, padding_height, padding_width,
+        output_depth, output_height, output_width, col.data<T>());
+  }
+};
+
+template <class T>
+__global__ void col2vol(int num_kernels, const T* data_col, int depth,
+                        int height, int width, int filter_depth,
+                        int filter_height, int filter_width, int stride_depth,
+                        int stride_height, int stride_width, int padding_depth,
+                        int padding_height, int padding_width, int output_detph,
+                        int output_height, int output_width, T* data_vol) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    T src_val = 0;
+    int w = index % width + padding_width;
+    int h = (index / width) % height + padding_height;
+    int d = (index / width / height) % depth + padding_depth;
+    int c = index / width / height / depth;
+    // compute the start and end of the output
+    int w_col_start =
+        (w < filter_width) ? 0 : (w - filter_width) / stride_width + 1;
+    int w_col_end = min(w / stride_width + 1, output_width);
+    int h_col_start =
+        (h < filter_height) ? 0 : (h - filter_height) / stride_height + 1;
+    int h_col_end = min(h / stride_height + 1, output_height);
+    int d_col_start =
+        (d < filter_depth) ? 0 : (d - filter_depth) / stride_depth + 1;
+    int d_col_end = min(d / stride_depth + 1, output_detph);
+
+    int offset = (c * filter_depth * filter_height * filter_width +
+                  d * filter_width * filter_height + h * filter_width + w) *
+                 output_detph * output_height * output_width;
+
+    int coeff_d_col =
+        (1 - stride_depth * filter_width * filter_height * output_detph) *
+        output_height * output_width;
+    int coeff_h_col =
+        (1 - stride_height * filter_width * output_detph * output_height) *
+        output_width;
+    int coeff_w_col =
+        (1 - stride_width * output_detph * output_height * output_width);
+
+    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          src_val += data_col[offset + d_col * coeff_d_col +
+                              h_col * coeff_h_col + w_col * coeff_w_col];
+        }
+      }
+    }
+    data_vol[index] = src_val;
+  }
+}
+
+/*
+ * vol = [input_channels, input_depth, input_height, input_width]
+ * col =
+ *   [input_channels, filter_depth, filter_height, filter_width,
+ *                    output_depth, output_height, output_width]
+ */
+template <class T>
+class Col2VolFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& vol, const framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const {
+    PADDLE_ENFORCE(vol.dims().size() == 4);
+    PADDLE_ENFORCE(col.dims().size() == 7);
+
+    int input_channels = vol.dims()[0];
+    int input_depth = vol.dims()[1];
+    int input_height = vol.dims()[2];
+    int input_width = vol.dims()[3];
+    int filter_depth = col.dims()[1];
+    int filter_height = col.dims()[2];
+    int filter_width = col.dims()[3];
+    int output_depth = col.dims()[4];
+    int output_height = col.dims()[5];
+    int output_width = col.dims()[6];
+
+    int num_kernels = input_channels * input_depth * input_height * input_width;
+
+    const int threads = 1024;
+    const int blocks = (num_kernels + 1024 - 1) / 1024;
+
+    col2vol<T><<<blocks, threads, 0,
+                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                     .stream()>>>(
+        num_kernels, col.data<T>(), input_depth, input_height, input_width,
+        filter_depth, filter_height, filter_width, stride_depth, stride_height,
+        stride_width, padding_depth, padding_height, padding_width,
+        output_depth, output_height, output_width, vol.data<T>());
+  }
+};
+
+template class Vol2ColFunctor<platform::GPUPlace, float>;
+template class Vol2ColFunctor<platform::GPUPlace, double>;
+template class Col2VolFunctor<platform::GPUPlace, float>;
+template class Col2VolFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h
new file mode 100644
index 0000000000..f022365a16
--- /dev/null
+++ b/paddle/operators/math/vol2col.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+/*
+ * \brief Vol2ColFunctor converts the feature data of four dimensions (CDHW)
+ *        into a column tensor of seven dimensions; Col2VolFunctor performs
+ *        the reverse conversion.
+ *
+ * \param context    Device context passed through to the implementation.
+ * \param vol        Volume tensor (const input of Vol2ColFunctor, output of
+ *                   Col2VolFunctor), with shape
+ *                   [input_channels, input_depth, input_height, input_width].
+ * \param col        Column tensor (output of Vol2ColFunctor, const input of
+ *                   Col2VolFunctor), with the seven-dimensional shape below.
+ * \param stride_*   Strides for depth, height and width.
+ * \param padding_*  Paddings for depth, height and width.
+ *
+ * The shape of col is [input_channels, filter_depth, filter_height,
+ * filter_width, output_depth, output_height, output_width], so it is easy to
+ * reshape into a [height, width] convolution matrix for convolution based on
+ * matrix multiplication, where height is equal to input_channels *
+ * filter_depth * filter_height * filter_width, and width is equal to
+ * output_depth * output_height * output_width.
+ *
+ * Reshape:
+ *     shape of col               shape of convolution matrix
+ *     [input_channels,
+ *      filter_depth,
+ *      filter_height,
+ *      filter_width,      ======>      [height, width]
+ *      output_depth,
+ *      output_height,
+ *      output_width]
+ *
+ * \note The caller must ensure vol and col have the same input_channels.
+ */
+template <typename Place, typename T>
+class Vol2ColFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& vol, framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const;
+};
+
+template <typename Place, typename T>
+class Col2VolFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& vol, const framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const;
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc
new file mode 100644
index 0000000000..107a94511f
--- /dev/null
+++ b/paddle/operators/math/vol2col_test.cc
@@ -0,0 +1,156 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/vol2col.h"
+#include <gtest/gtest.h>
+#include <iostream>
+
+// Runs Vol2ColFunctor and then Col2VolFunctor for `Place` on a 1x2x2x3 input
+// and checks both results against hand-computed values.
+template <typename Place>
+void testVol2col() {
+  paddle::framework::Tensor input_tmp;
+  paddle::framework::Tensor input;
+  paddle::framework::Tensor output_cfo;
+  paddle::framework::Tensor output_tmp;
+
+  Place place = Place();  // automatic storage, so nothing has to be freed
+  paddle::platform::DeviceContext* context = nullptr;
+  if (paddle::platform::is_cpu_place(place)) {
+    context =
+        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
+  } else {
+#ifndef PADDLE_ONLY_CPU
+    context =
+        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
+#else
+    PADDLE_THROW("no GPU support");
+#endif  // PADDLE_ONLY_CPU
+  }
+
+  /**
+   * input = [[0, 1, 2,
+   *          3, 4, 5]
+   *          [6, 7, 8,
+   *          9, 10, 11]]
+   *
+   * output_cfo = [0, 1
+   *               1, 2
+   *               3, 4
+   *               4, 5
+   *               6, 7
+   *               7, 8
+   *               9, 10
+   *               10, 11]
+   *
+   * col2vol = [[0, 2, 2,
+   *             3, 8, 5]
+   *            [6, 14, 8,
+   *             9, 20, 11]]
+   */
+  int input_depth = 2;
+  int input_height = 2;
+  int input_width = 3;
+  int filter_size = 2;
+  int stride = 1;
+  int padding = 0;
+  int output_depth = (input_depth - filter_size + 2 * padding) / stride + 1;
+  int output_height = (input_height - filter_size + 2 * padding) / stride + 1;
+  int output_width = (input_width - filter_size + 2 * padding) / stride + 1;
+
+  // Vol2Col test: fill the input on the CPU, then move it to `place` if needed
+  float* input_ptr =
+      input_tmp.mutable_data<float>({1, input_depth, input_height, input_width},
+                                    paddle::platform::CPUPlace());
+  float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input_ptr, arr, 12 * sizeof(float));
+
+  if (paddle::platform::is_cpu_place(place)) {
+    input = input_tmp;
+  } else {
+    input.CopyFrom<float>(input_tmp, place);
+  }
+  output_cfo.mutable_data<float>({1, filter_size, filter_size, filter_size,
+                                  output_depth, output_height, output_width},
+                                 place);
+
+  paddle::operators::math::Vol2ColFunctor<Place, float> vol2col;
+  vol2col(*context, input, output_cfo, stride, stride, stride, padding, padding,
+          padding);
+  float* out_cfo_ptr;
+  if (paddle::platform::is_cpu_place(place)) {
+    out_cfo_ptr = output_cfo.data<float>();
+  } else {
+    output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace());
+    out_cfo_ptr = output_tmp.data<float>();
+  }
+
+  EXPECT_EQ(out_cfo_ptr[0], 0);
+  EXPECT_EQ(out_cfo_ptr[1], 1);
+  EXPECT_EQ(out_cfo_ptr[2], 1);
+  EXPECT_EQ(out_cfo_ptr[3], 2);
+  EXPECT_EQ(out_cfo_ptr[4], 3);
+  EXPECT_EQ(out_cfo_ptr[5], 4);
+  EXPECT_EQ(out_cfo_ptr[6], 4);
+  EXPECT_EQ(out_cfo_ptr[7], 5);
+  EXPECT_EQ(out_cfo_ptr[8], 6);
+  EXPECT_EQ(out_cfo_ptr[9], 7);
+  EXPECT_EQ(out_cfo_ptr[10], 7);
+  EXPECT_EQ(out_cfo_ptr[11], 8);
+  EXPECT_EQ(out_cfo_ptr[12], 9);
+  EXPECT_EQ(out_cfo_ptr[13], 10);
+  EXPECT_EQ(out_cfo_ptr[14], 10);
+  EXPECT_EQ(out_cfo_ptr[15], 11);
+
+  // Col2Vol test: zero the volume first, then convert output_cfo back into it
+  memset(input_ptr, 0, 12 * sizeof(float));
+  if (paddle::platform::is_cpu_place(place)) {
+    input = input_tmp;
+  } else {
+    input.CopyFrom<float>(input_tmp, place);
+  }
+
+  paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
+  col2vol(*context, input, output_cfo, stride, stride, stride, padding, padding,
+          padding);
+  float* in_cfo_ptr;
+  if (paddle::platform::is_cpu_place(place)) {
+    in_cfo_ptr = input.data<float>();
+  } else {
+    input_tmp.CopyFrom<float>(input, paddle::platform::CPUPlace());
+    in_cfo_ptr = input_tmp.data<float>();
+  }
+
+  EXPECT_EQ(in_cfo_ptr[0], 0);
+  EXPECT_EQ(in_cfo_ptr[1], 2);
+  EXPECT_EQ(in_cfo_ptr[2], 2);
+  EXPECT_EQ(in_cfo_ptr[3], 3);
+  EXPECT_EQ(in_cfo_ptr[4], 8);
+  EXPECT_EQ(in_cfo_ptr[5], 5);
+  EXPECT_EQ(in_cfo_ptr[6], 6);
+  EXPECT_EQ(in_cfo_ptr[7], 14);
+  EXPECT_EQ(in_cfo_ptr[8], 8);
+  EXPECT_EQ(in_cfo_ptr[9], 9);
+  EXPECT_EQ(in_cfo_ptr[10], 20);
+  EXPECT_EQ(in_cfo_ptr[11], 11);
+  // Release the device context allocated at the top of the test.
+  delete context;
+}
+// Runs the check on CPU, and on GPU unless PADDLE_ONLY_CPU is defined.
+TEST(math, vol2col) {
+  testVol2col<paddle::platform::CPUPlace>();
+#ifndef PADDLE_ONLY_CPU
+  testVol2col<paddle::platform::GPUPlace>();
+#endif
+}

From 089cc11df48c8b29b34eda8ea19328a090d4c9f6 Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Mon, 9 Oct 2017 03:30:53 +0000
Subject: [PATCH 33/61] clean up && fix #4624

---
 paddle/framework/block_desc.cc    |   6 ++
 paddle/framework/executor.cc      |  37 +++------
 paddle/framework/executor_test.cc | 129 ++++++++++++------------------
 3 files changed, 68 insertions(+), 104 deletions(-)

diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 01f50e1393..509aa235d3 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -74,6 +74,12 @@ void BlockDescBind::Sync() {
     for (auto &op_desc : ops_) {
       op_field.AddAllocated(op_desc->Proto());
     }
+    auto &var_field = *this->desc_->mutable_vars();
+    var_field.Clear();
+    var_field.Reserve(static_cast<int>(vars_.size()));
+    for (auto &var_desc : vars_) {
+      var_field.AddAllocated(var_desc.second->Proto());
+    }
     need_update_ = false;
   }
 }
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 9391e18ded..c6c9d13469 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -54,39 +54,33 @@ Executor::~Executor() {
 
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
   // TODO(tonyyang-svail):
-  //    - only runs the first block
-  //    - only runs on the first device
-  //    - test on gpu
+  //    - only runs the first block (i.e. no RNN support)
+  //    - only runs on the first device (i.e. no interdevice communication)
   auto& block = pdesc.blocks(0);
   auto& device = device_contexts_[0];
 
-  // TODO(tonyyang-svail):
-  //    - runs on a new local scope
-  // Scope& local_scope = scope->NewScope();
-
+  // Instantiate all the vars in the global scope
   for (auto& var : block.vars()) {
     scope->NewVar(var.name());
   }
 
+  Scope& local_scope = scope->NewScope();
+
   std::vector<bool> should_run = Preprocess(pdesc);
   PADDLE_ENFORCE(should_run.size() == block.ops_size());
   for (size_t i = 0; i < should_run.size(); ++i) {
     if (should_run[i]) {
+      for (auto var : block.ops(i).outputs()) {
+        for (auto argu : var.arguments()) {
+          if (local_scope.FindVar(argu) == nullptr) {
+            local_scope.NewVar(argu);
+          }
+        }
+      }
       auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i));
-      op->Run(*scope, *device);
+      op->Run(local_scope, *device);
     }
   }
-
-  // // print tensor value
-  // for (auto& var : block.vars()) {
-  //   std::cout << var.name() << std::endl;
-  //   auto v = scope->FindVar(var.name());
-  //   const LoDTensor& t = v->Get<LoDTensor>();
-  //   for (int i = 0; i < t.numel(); ++i) {
-  //     std::cout << t.data<float>()[i] << " ";
-  //   }
-  //   std::cout << std::endl;
-  // }
 }
 
 std::vector<bool> Executor::Preprocess(const ProgramDesc& pdesc) {
@@ -125,7 +119,6 @@ std::vector<bool> Executor::Preprocess(const ProgramDesc& pdesc) {
       }
     }
 
-    // TODO(tonyyang-svail): add VLOG here for debugging
     if (op_desc.type() == "fetch" || found_dependent_vars) {
       // erase its output to the dependency graph
       for (auto& var : op_desc.outputs()) {
@@ -141,13 +134,9 @@ std::vector<bool> Executor::Preprocess(const ProgramDesc& pdesc) {
         }
       }
 
-      // this op should be executed
       should_run.push_back(true);
-      LOG(INFO) << "Yes " << op_desc.type();
     } else {
-      // this op should NOT be executed
       should_run.push_back(false);
-      LOG(INFO) << "No " << op_desc.type();
     }
   }
 
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 7ce472ed2f..99f80d04e8 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/backward.h"
 #include "paddle/framework/block_desc.h"
-#include "paddle/framework/grad_op_builder.h"
+// #include "paddle/framework/grad_op_builder.h"
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
@@ -37,68 +37,27 @@ using namespace paddle::framework;
 typedef paddle::framework::BlockDesc proto_block;
 typedef paddle::framework::OpDesc proto_op;
 
-struct SetAttrDescVisitor : public boost::static_visitor<void> {
-  explicit SetAttrDescVisitor(OpDesc::Attr* attr) : attr_(attr) {}
-  mutable OpDesc::Attr* attr_;
-  void operator()(int v) const { attr_->set_i(v); }
-  void operator()(float v) const { attr_->set_f(v); }
-  void operator()(const std::string& v) const { attr_->set_s(v); }
-  void operator()(bool b) const { attr_->set_b(b); }
-
-  void operator()(const std::vector<int>& v) const {
-    VectorToRepeated(v, attr_->mutable_ints());
-  }
-  void operator()(const std::vector<float>& v) const {
-    VectorToRepeated(v, attr_->mutable_floats());
-  }
-  void operator()(const std::vector<std::string>& v) const {
-    VectorToRepeated(v, attr_->mutable_strings());
-  }
-  void operator()(const std::vector<bool>& v) const {
-    VectorToRepeated(v, attr_->mutable_bools());
-  }
-  void operator()(BlockDesc* desc) const { attr_->set_block_idx(desc->idx()); }
-  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
-};
-
 void AddOp(const std::string& type, const VariableNameMap& inputs,
            const VariableNameMap& outputs, AttributeMap attrs,
-           proto_block* block) {
+           paddle::framework::BlockDescBind* block) {
   // insert output
   for (auto kv : outputs) {
     for (auto v : kv.second) {
-      auto var = block->add_vars();
-      var->set_name(v);
-      auto var_lt = var->mutable_lod_tensor();
-      var_lt->set_data_type(paddle::framework::DataType::FP32);
+      auto var = block->NewVar(v);
+      var->SetDataType(paddle::framework::DataType::FP32);
     }
   }
 
   // insert op
-  auto op = block->add_ops();
-  op->set_type(type);
+  auto op = block->AppendOp();
+  op->SetType(type);
   for (auto kv : inputs) {
-    auto X = op->add_inputs();
-    X->set_parameter(kv.first);
-    for (auto argu : kv.second) {
-      X->add_arguments(argu);
-    }
+    op->SetInput(kv.first, kv.second);
   }
   for (auto kv : outputs) {
-    auto X = op->add_outputs();
-    X->set_parameter(kv.first);
-    for (auto argu : kv.second) {
-      X->add_arguments(argu);
-    }
-  }
-  for (auto& attr : attrs) {
-    auto* attr_desc = op->add_attrs();
-    attr_desc->set_name(attr.first);
-    attr_desc->set_type(
-        static_cast<paddle::framework::AttrType>(attr.second.which() - 1));
-    SetAttrDescVisitor visitor(attr_desc);
-    boost::apply_visitor(visitor, attr.second);
+    op->SetOutput(kv.first, kv.second);
   }
+  op->SetAttrMap(attrs);
 }
 
 std::once_flag set_variable_flag;
@@ -146,10 +105,16 @@ class ExecutorTesterRandom : public ::testing::Test {
   virtual void SetUp() override {
     int input_dim = 5, batch_size = 2, embed_dim = 5;
 
-    // init pdesc
-    auto init_root_block = init_pdesc_.add_blocks();
-    init_root_block->set_idx(0);
-    init_root_block->set_parent_idx(-1);
+    // init pdesc -----------------------------------------
+    auto temp_init_root_block = init_pdesc_.add_blocks();
+    temp_init_root_block->set_idx(0);
+    temp_init_root_block->set_parent_idx(-1);
+
+    // wrap to BlockDescBind
+    paddle::framework::ProgramDescBind& init_program =
+        paddle::framework::ProgramDescBind::Instance(&init_pdesc_);
+    paddle::framework::BlockDescBind* init_root_block = init_program.Block(0);
+
     AddOp("gaussian_random", {}, {{"Out", {"w1"}}},
           {{"dims", std::vector<int>{input_dim, embed_dim}}}, init_root_block);
     AddOp("gaussian_random", {}, {{"Out", {"w2"}}},
@@ -160,11 +125,18 @@ class ExecutorTesterRandom : public ::testing::Test {
     AddOp("fetch", {{"Input", {"w2"}}}, {},
           {{"dims", std::vector<int>{embed_dim, input_dim}}, {"col", 1}},
           init_root_block);
+    // flush
+    init_program.Proto();
+
+    // run pdesc -----------------------------------------
+    auto temp_root_block = pdesc_.add_blocks();
+    temp_root_block->set_idx(0);
+    temp_root_block->set_parent_idx(-1);
 
-    // run pdesc
-    auto root_block = pdesc_.add_blocks();
-    root_block->set_idx(0);
-    root_block->set_parent_idx(-1);
+    // wrap to BlockDescBind
+    paddle::framework::ProgramDescBind& program =
+        paddle::framework::ProgramDescBind::Instance(&pdesc_);
+    paddle::framework::BlockDescBind* root_block = program.Block(0);
 
     AddOp("gaussian_random", {}, {{"Out", {"a"}}},
           {{"dims", std::vector<int>{batch_size, input_dim}}}, root_block);
@@ -175,13 +147,16 @@ class ExecutorTesterRandom : public ::testing::Test {
     AddOp("squared_l2_distance", {{"X", {"a"}}, {"Y", {"a_out"}}},
           {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {},
           root_block);
-
-    AppendBackward(pdesc_, {});
-    // AddOp("fetch", {{"Input", {"sub_result"}}}, {},
-    //       {{"dims", std::vector<int>{input_dim, batch_size}}, {"col", 0}},
-    //       root_block);
     AddOp("fetch", {{"Input", {"l2_distance"}}}, {},
           {{"dims", std::vector<int>{batch_size}}, {"col", 1}}, root_block);
+    // flush
+    program.Proto();
+
+    // TODO(tonyyang-svail):
+    //   - Test with Backward
+    // AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}},
+    //       {{"dims", std::vector<int>{batch_size, 1}}}, root_block);
+    // AppendBackward(program, {});
   }
 
  protected:
@@ -192,9 +167,14 @@ class ExecutorTesterRandom : public ::testing::Test {
 class ExecutorTesterFeedAndFetch : public ::testing::Test {
  public:
   virtual void SetUp() override {
-    auto root_block = pdesc_.add_blocks();
-    root_block->set_idx(0);
-    root_block->set_parent_idx(-1);
+    auto temp_root_block = pdesc_.add_blocks();
+    temp_root_block->set_idx(0);
+    temp_root_block->set_parent_idx(-1);
+
+    // wrap to BlockDescBind
+    paddle::framework::ProgramDescBind& program =
+        paddle::framework::ProgramDescBind::Instance(&pdesc_);
+    paddle::framework::BlockDescBind* root_block = program.Block(0);
 
     std::vector<int> dim{6};
 
@@ -207,6 +187,9 @@ class ExecutorTesterFeedAndFetch : public ::testing::Test {
     AddOp("fetch", {{"Input", {"b"}}}, {}, {{"dims", dim}, {"col", 1}},
           root_block);
 
+    // flush
+    program.Proto();
+
     std::vector<float> vec1 = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
     std::vector<float> vec2 = {4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
     inputs_.push_back(vec1);
@@ -235,12 +218,6 @@ TEST_F(ExecutorTesterRandom, CPU) {
   executor->Run(pdesc_, GetGlobalScope());
   std::vector<std::vector<float>> result = get_fetch_variable<float>();
 
-  for (auto& vec : result) {
-    for (auto& num : vec) {
-      std::cout << num << " ";
-    }
-    std::cout << std::endl;
-  }
   delete executor;
 }
 
@@ -290,18 +267,10 @@ TEST_F(ExecutorTesterRandom, GPU) {
 
   Executor* executor = new Executor(places);
 
-  LOG(INFO) << "Run Init";
   executor->Run(init_pdesc_, GetGlobalScope());
-  LOG(INFO) << "Run";
   executor->Run(pdesc_, GetGlobalScope());
   std::vector<std::vector<float>> result = get_fetch_variable<float>();
 
-  for (auto& vec : result) {
-    for (auto& num : vec) {
-      std::cout << num << " ";
-    }
-    std::cout << std::endl;
-  }
   delete executor;
 }
 

From e51557130e91383afb0e54dee00710664c9bf555 Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Mon, 9 Oct 2017 22:57:11 +0000
Subject: [PATCH 34/61] clean up for review

---
 paddle/framework/executor.cc      | 40 ++++++++++++++-------
 paddle/framework/executor.h       |  2 +-
 paddle/framework/executor_test.cc | 60 +++++++++++++------------------
 paddle/framework/scope.cc         |  1 +
 paddle/operators/feed_op.cc       |  1 +
 paddle/operators/fetch_op.cc      |  1 +
 paddle/platform/gpu_info.cc       |  2 +-
 7 files changed, 56 insertions(+), 51 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index c6c9d13469..3ac752388f 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -13,11 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/executor.h"
+
 #include <algorithm>
 #include <iostream>
 #include <memory>
 #include <set>
 #include <vector>
+
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
@@ -27,7 +29,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+const std::string kFeedOpType = "feed";
+const std::string kFetchOpType = "fetch";
+
 Executor::Executor(const std::vector<platform::Place>& places) {
+  PADDLE_ENFORCE_GT(places.size(), 0);
   device_contexts_.resize(places.size());
   for (size_t i = 0; i < places.size(); i++) {
     if (platform::is_cpu_place(places[i])) {
@@ -46,9 +52,7 @@ Executor::Executor(const std::vector<platform::Place>& places) {
 
 Executor::~Executor() {
   for (auto& device_context : device_contexts_) {
-    if (device_context) {
-      delete device_context;
-    }
+    delete device_context;
   }
 }
 
@@ -56,6 +60,8 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
   // TODO(tonyyang-svail):
   //    - only runs the first block (i.e. no RNN support)
   //    - only runs on the first device (i.e. no interdevice communication)
+  //    - will change to use multiple blocks for RNN op and Cond Op
+  PADDLE_ENFORCE_GT(pdesc.blocks_size(), 0);
   auto& block = pdesc.blocks(0);
   auto& device = device_contexts_[0];
 
@@ -66,12 +72,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
 
   Scope& local_scope = scope->NewScope();
 
-  std::vector<bool> should_run = Preprocess(pdesc);
-  PADDLE_ENFORCE(should_run.size() == block.ops_size());
+  std::vector<bool> should_run = Prune(pdesc);
+  PADDLE_ENFORCE_EQ(should_run.size(), block.ops_size());
   for (size_t i = 0; i < should_run.size(); ++i) {
     if (should_run[i]) {
-      for (auto var : block.ops(i).outputs()) {
-        for (auto argu : var.arguments()) {
+      for (auto& var : block.ops(i).outputs()) {
+        for (auto& argu : var.arguments()) {
           if (local_scope.FindVar(argu) == nullptr) {
             local_scope.NewVar(argu);
           }
@@ -81,28 +87,32 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
       op->Run(local_scope, *device);
     }
   }
+
+  // TODO(tonyyang-svail):
+  //  - Destroy local_scope
 }
 
-std::vector<bool> Executor::Preprocess(const ProgramDesc& pdesc) {
+std::vector<bool> Executor::Prune(const ProgramDesc& pdesc) {
   // TODO(tonyyang-svail):
   //    - only runs the first block
+  //    - will change to use multiple blocks for RNN op and Cond Op
 
   auto& block = pdesc.blocks(0);
   auto& ops = block.ops();
 
   bool expect_feed = true;
   for (auto& op_desc : ops) {
-    PADDLE_ENFORCE(op_desc.type() != "feed" || expect_feed,
+    PADDLE_ENFORCE(op_desc.type() != kFeedOpType || expect_feed,
                    "All FeedOps are at the beginning of the ProgramDesc");
-    expect_feed = (op_desc.type() == "feed");
+    expect_feed = (op_desc.type() == kFeedOpType);
   }
 
   bool expect_fetch = true;
   for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
     auto& op_desc = *op_iter;
-    PADDLE_ENFORCE(op_desc.type() != "fetch" || expect_fetch,
+    PADDLE_ENFORCE(op_desc.type() != kFetchOpType || expect_fetch,
                    "All FetchOps must at the end of the ProgramDesc");
-    expect_fetch = (op_desc.type() == "fetch");
+    expect_fetch = (op_desc.type() == kFetchOpType);
   }
 
   std::set<std::string> dependent_vars;
@@ -119,7 +129,7 @@ std::vector<bool> Executor::Preprocess(const ProgramDesc& pdesc) {
       }
     }
 
-    if (op_desc.type() == "fetch" || found_dependent_vars) {
+    if (op_desc.type() == kFetchOpType || found_dependent_vars) {
       // erase its output to the dependency graph
       for (auto& var : op_desc.outputs()) {
         for (auto& argu : var.arguments()) {
@@ -140,6 +150,10 @@ std::vector<bool> Executor::Preprocess(const ProgramDesc& pdesc) {
     }
   }
 
+  // TODO(tonyyang-svail):
+  //    - check this after integration of Init
+  // PADDLE_ENFORCE(dependent_vars.empty());
+
   // since we are traversing the ProgramDesc in reverse order
   // we reverse the should_run vector
   std::reverse(should_run.begin(), should_run.end());
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index 75cb5939ff..f832b0d7d6 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -46,7 +46,7 @@ class Executor {
    * @return
    *  vector<bool> Same size as ops. Indicates whether an op should be run.
    */
-  std::vector<bool> Preprocess(const ProgramDesc& pdesc);
+  std::vector<bool> Prune(const ProgramDesc& pdesc);
 
  private:
   std::vector<platform::DeviceContext*> device_contexts_;
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 99f80d04e8..f28651e809 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/executor.h"
+
+#include <memory>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/backward.h"
 #include "paddle/framework/block_desc.h"
-// #include "paddle/framework/grad_op_builder.h"
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
@@ -34,9 +36,6 @@ using std::string;
 using namespace paddle::platform;
 using namespace paddle::framework;
 
-typedef paddle::framework::BlockDesc proto_block;
-typedef paddle::framework::OpDesc proto_op;
-
 void AddOp(const std::string& type, const VariableNameMap& inputs,
            const VariableNameMap& outputs, AttributeMap attrs,
            paddle::framework::BlockDescBind* block) {
@@ -51,10 +50,10 @@ void AddOp(const std::string& type, const VariableNameMap& inputs,
   // insert op
   auto op = block->AppendOp();
   op->SetType(type);
-  for (auto kv : inputs) {
+  for (auto& kv : inputs) {
     op->SetInput(kv.first, kv.second);
   }
-  for (auto kv : outputs) {
+  for (auto& kv : outputs) {
     op->SetOutput(kv.first, kv.second);
   }
   op->SetAttrMap(attrs);
@@ -65,11 +64,11 @@ std::once_flag set_variable_flag;
 // Tensors in feed value variable will only be in CPUPlace
 // So we can  memcpy the data from vector<T> to feed_value
 template <typename T>
-void set_feed_variable(const std::vector<std::vector<T>>& inputs) {
+void SetFeedVariable(const std::vector<std::vector<T>>& inputs) {
   typedef std::vector<paddle::framework::Tensor> FeedInputs;
   Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value");
   FeedInputs& feed_inputs = *(g_feed_value->GetMutable<FeedInputs>());
-  auto size = inputs.size();
+  size_t size = inputs.size();
   feed_inputs.resize(size);
   for (size_t i = 0; i < size; i++) {
     T* dst = feed_inputs[i].mutable_data<T>(
@@ -81,12 +80,12 @@ void set_feed_variable(const std::vector<std::vector<T>>& inputs) {
 // Tensors in fetch value variable will only be in CPUPlace
 // So we can memcpy the data from fetch_value to vector<T>
 template <typename T>
-std::vector<std::vector<T>> get_fetch_variable() {
+std::vector<std::vector<T>> GetFetchVariable() {
   typedef std::vector<paddle::framework::Tensor> FetchOutputs;
   Variable* g_fetch_value = GetGlobalScope()->FindVar("fetch_value");
   FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable<FetchOutputs>());
 
-  auto size = fetch_outputs.size();
+  size_t size = fetch_outputs.size();
   std::vector<std::vector<T>> result;
   result.reserve(size);
   for (size_t i = 0; i < size; i++) {
@@ -105,7 +104,7 @@ class ExecutorTesterRandom : public ::testing::Test {
   virtual void SetUp() override {
     int input_dim = 5, batch_size = 2, embed_dim = 5;
 
-    // init pdesc -----------------------------------------
+    // init pdesc
     auto temp_init_root_block = init_pdesc_.add_blocks();
     temp_init_root_block->set_idx(0);
     temp_init_root_block->set_parent_idx(-1);
@@ -128,7 +127,7 @@ class ExecutorTesterRandom : public ::testing::Test {
     // flush
     init_program.Proto();
 
-    // run pdesc -----------------------------------------
+    // run pdesc
     auto temp_root_block = pdesc_.add_blocks();
     temp_root_block->set_idx(0);
     temp_root_block->set_parent_idx(-1);
@@ -154,9 +153,6 @@ class ExecutorTesterRandom : public ::testing::Test {
 
     // TODO(tonyyang-svail):
     //   - Test with Backward
-    // AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}},
-    //       {{"dims", std::vector<int>{batch_size, 1}}}, root_block);
-    // AppendBackward(program, {});
   }
 
  protected:
@@ -213,12 +209,11 @@ TEST_F(ExecutorTesterRandom, CPU) {
   // "pointer being freed was not allocated" error will appear.
   paddle::memory::Used(cpu_place);
 
-  Executor* executor = new Executor(places);
+  std::unique_ptr<Executor> executor(new Executor(places));
+
   executor->Run(init_pdesc_, GetGlobalScope());
   executor->Run(pdesc_, GetGlobalScope());
-  std::vector<std::vector<float>> result = get_fetch_variable<float>();
-
-  delete executor;
+  std::vector<std::vector<float>> result = GetFetchVariable<float>();
 }
 
 TEST_F(ExecutorTesterFeedAndFetch, CPU) {
@@ -232,13 +227,12 @@ TEST_F(ExecutorTesterFeedAndFetch, CPU) {
   // "pointer being freed was not allocated" error will appear.
   paddle::memory::Used(cpu_place);
 
-  Executor* executor = new Executor(places);
+  std::unique_ptr<Executor> executor(new Executor(places));
 
-  // 3 mini-batch
-  for (int i = 0; i < 3; i++) {
-    set_feed_variable<float>(inputs_);
+  for (int batch_id = 0; batch_id < 3; batch_id++) {
+    SetFeedVariable<float>(inputs_);
     executor->Run(pdesc_, GetGlobalScope());
-    std::vector<std::vector<float>> result = get_fetch_variable<float>();
+    std::vector<std::vector<float>> result = GetFetchVariable<float>();
     PADDLE_ENFORCE_EQ(result.size(), inputs_.size());
     for (size_t i = 0; i < result.size(); ++i) {
       PADDLE_ENFORCE_EQ(result[i].size(), inputs_[i].size());
@@ -247,8 +241,6 @@ TEST_F(ExecutorTesterFeedAndFetch, CPU) {
       }
     }
   }
-
-  delete executor;
 }
 #else
 TEST_F(ExecutorTesterRandom, GPU) {
@@ -265,13 +257,11 @@ TEST_F(ExecutorTesterRandom, GPU) {
   paddle::memory::Used(CPUPlace());
   paddle::memory::Used(gpu_place);
 
-  Executor* executor = new Executor(places);
+  std::unique_ptr<Executor> executor(new Executor(places));
 
   executor->Run(init_pdesc_, GetGlobalScope());
   executor->Run(pdesc_, GetGlobalScope());
-  std::vector<std::vector<float>> result = get_fetch_variable<float>();
-
-  delete executor;
+  std::vector<std::vector<float>> result = GetFetchVariable<float>();
 }
 
 TEST_F(ExecutorTesterFeedAndFetch, GPU) {
@@ -287,13 +277,12 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) {
   paddle::memory::Used(CPUPlace());
   paddle::memory::Used(gpu_place);
 
-  Executor* executor = new Executor(places);
+  std::unique_ptr<Executor> executor(new Executor(places));
 
-  // 3 mini-batch
-  for (int i = 0; i < 3; i++) {
-    set_feed_variable<float>(inputs_);
+  for (int batch_id = 0; batch_id < 3; batch_id++) {
+    SetFeedVariable<float>(inputs_);
     executor->Run(pdesc_, GetGlobalScope());
-    std::vector<std::vector<float>> result = get_fetch_variable<float>();
+    std::vector<std::vector<float>> result = GetFetchVariable<float>();
     PADDLE_ENFORCE_EQ(result.size(), inputs_.size());
     for (size_t i = 0; i < result.size(); ++i) {
       PADDLE_ENFORCE_EQ(result[i].size(), inputs_[i].size());
@@ -302,6 +291,5 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) {
       }
     }
   }
-  delete executor;
 }
 #endif
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 2a0d9bbf33..c9e53a0d85 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/scope.h"
+
 #include <memory>  // for unique_ptr
 #include <mutex>   // for call_once
 #include "paddle/string/printf.h"
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index b9e43be966..dcd5f7fb77 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -31,6 +31,7 @@ class FeedOp : public framework::OperatorWithKernel {
 
     const FeedInputs& tensors = g_feed_variable->Get<FeedInputs>();
 
+    PADDLE_ENFORCE_GT(tensors.size(), col);
     auto in_dim = tensors[col].dims();
     ctx->SetOutputDim("Out", in_dim);
     // TODO(qijun): need to handle LodTensor later
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index 7bde4953cd..5adb83144a 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -35,6 +35,7 @@ class FetchOp : public framework::OperatorWithKernel {
     }
 
     auto input_dim = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE_GT(tensors->size(), col);
     (*tensors)[col].Resize(input_dim);
 
     // TODO(qijun): need to handle LodTensor later
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index aa76bb209d..0cab5ffc56 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -44,7 +44,7 @@ int GetCurrentDeviceId() {
 
 void SetDeviceId(int id) {
   // TODO(qijun): find a better way to cache the cuda device count
-  PADDLE_ENFORCE(id < GetCUDADeviceCount(), "id must less than GPU count");
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
   PADDLE_ENFORCE(cudaSetDevice(id),
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }

From 340d21d4ed7d8f0f2cc511b6480771965234570e Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 10 Oct 2017 00:02:47 +0000
Subject: [PATCH 35/61] Init at block[0]; Run at block[1]

---
 paddle/framework/executor.cc      | 16 ++++----
 paddle/framework/executor.h       |  4 +-
 paddle/framework/executor_test.cc | 63 +++++++++++++------------------
 3 files changed, 36 insertions(+), 47 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 3ac752388f..bbc7f77a94 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -56,13 +56,12 @@ Executor::~Executor() {
   }
 }
 
-void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
+void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
   // TODO(tonyyang-svail):
-  //    - only runs the first block (i.e. no RNN support)
   //    - only runs on the first device (i.e. no interdevice communication)
   //    - will change to use multiple blocks for RNN op and Cond Op
-  PADDLE_ENFORCE_GT(pdesc.blocks_size(), 0);
-  auto& block = pdesc.blocks(0);
+  PADDLE_ENFORCE_GT(pdesc.blocks_size(), block_id);
+  auto& block = pdesc.blocks(block_id);
   auto& device = device_contexts_[0];
 
   // Instantiate all the vars in the global scope
@@ -72,7 +71,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
 
   Scope& local_scope = scope->NewScope();
 
-  std::vector<bool> should_run = Prune(pdesc);
+  std::vector<bool> should_run = Prune(pdesc, block_id);
   PADDLE_ENFORCE_EQ(should_run.size(), block.ops_size());
   for (size_t i = 0; i < should_run.size(); ++i) {
     if (should_run[i]) {
@@ -92,12 +91,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) {
   //  - Destroy local_scope
 }
 
-std::vector<bool> Executor::Prune(const ProgramDesc& pdesc) {
+std::vector<bool> Executor::Prune(const ProgramDesc& pdesc, int block_id) {
   // TODO(tonyyang-svail):
-  //    - only runs the first block
   //    - will change to use multiple blocks for RNN op and Cond Op
 
-  auto& block = pdesc.blocks(0);
+  auto& block = pdesc.blocks(block_id);
   auto& ops = block.ops();
 
   bool expect_feed = true;
@@ -144,8 +142,10 @@ std::vector<bool> Executor::Prune(const ProgramDesc& pdesc) {
         }
       }
 
+      LOG(INFO) << "1 " << op_desc.type();
       should_run.push_back(true);
     } else {
+      LOG(INFO) << "0 " << op_desc.type();
       should_run.push_back(false);
     }
   }
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index f832b0d7d6..7fac4f4f46 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -34,7 +34,7 @@ class Executor {
    *  ProgramDesc
    *  Scope
    */
-  void Run(const ProgramDesc&, Scope*);
+  void Run(const ProgramDesc&, Scope*, int);
 
  protected:
   /* @Brief
@@ -46,7 +46,7 @@ class Executor {
    * @return
    *  vector<bool> Same size as ops. Indicates whether an op should be run.
    */
-  std::vector<bool> Prune(const ProgramDesc& pdesc);
+  std::vector<bool> Prune(const ProgramDesc& pdesc, int block_id);
 
  private:
   std::vector<platform::DeviceContext*> device_contexts_;
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index f28651e809..b64ba1c98f 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -104,50 +104,40 @@ class ExecutorTesterRandom : public ::testing::Test {
   virtual void SetUp() override {
     int input_dim = 5, batch_size = 2, embed_dim = 5;
 
-    // init pdesc
-    auto temp_init_root_block = init_pdesc_.add_blocks();
-    temp_init_root_block->set_idx(0);
-    temp_init_root_block->set_parent_idx(-1);
-
-    // wrap to BlockDescBind
-    paddle::framework::ProgramDescBind& init_program =
-        paddle::framework::ProgramDescBind::Instance(&init_pdesc_);
-    paddle::framework::BlockDescBind* init_root_block = init_program.Block(0);
+    auto temp_root_block = pdesc_.add_blocks();
+    temp_root_block->set_idx(0);
+    temp_root_block->set_parent_idx(-1);
+    paddle::framework::ProgramDescBind& program =
+        paddle::framework::ProgramDescBind::Instance(&pdesc_);
+    paddle::framework::BlockDescBind* root_block = program.Block(0);
 
+    // block[0]
     AddOp("gaussian_random", {}, {{"Out", {"w1"}}},
-          {{"dims", std::vector<int>{input_dim, embed_dim}}}, init_root_block);
+          {{"dims", std::vector<int>{input_dim, embed_dim}}}, root_block);
     AddOp("gaussian_random", {}, {{"Out", {"w2"}}},
-          {{"dims", std::vector<int>{embed_dim, input_dim}}}, init_root_block);
+          {{"dims", std::vector<int>{embed_dim, input_dim}}}, root_block);
     AddOp("fetch", {{"Input", {"w1"}}}, {},
           {{"dims", std::vector<int>{input_dim, embed_dim}}, {"col", 0}},
-          init_root_block);
+          root_block);
     AddOp("fetch", {{"Input", {"w2"}}}, {},
           {{"dims", std::vector<int>{embed_dim, input_dim}}, {"col", 1}},
-          init_root_block);
-    // flush
-    init_program.Proto();
-
-    // run pdesc
-    auto temp_root_block = pdesc_.add_blocks();
-    temp_root_block->set_idx(0);
-    temp_root_block->set_parent_idx(-1);
-
-    // wrap to BlockDescBind
-    paddle::framework::ProgramDescBind& program =
-        paddle::framework::ProgramDescBind::Instance(&pdesc_);
-    paddle::framework::BlockDescBind* root_block = program.Block(0);
+          root_block);
 
+    // block[1]
+    paddle::framework::BlockDescBind* run_block =
+        program.AppendBlock(*root_block);
     AddOp("gaussian_random", {}, {{"Out", {"a"}}},
-          {{"dims", std::vector<int>{batch_size, input_dim}}}, root_block);
+          {{"dims", std::vector<int>{batch_size, input_dim}}}, run_block);
     AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {},
-          root_block);
+          run_block);
     AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {},
-          root_block);
+          run_block);
     AddOp("squared_l2_distance", {{"X", {"a"}}, {"Y", {"a_out"}}},
           {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {},
-          root_block);
+          run_block);
     AddOp("fetch", {{"Input", {"l2_distance"}}}, {},
-          {{"dims", std::vector<int>{batch_size}}, {"col", 1}}, root_block);
+          {{"dims", std::vector<int>{batch_size}}, {"col", 1}}, run_block);
+
     // flush
     program.Proto();
 
@@ -157,7 +147,6 @@ class ExecutorTesterRandom : public ::testing::Test {
 
  protected:
   ProgramDesc pdesc_;
-  ProgramDesc init_pdesc_;
 };
 
 class ExecutorTesterFeedAndFetch : public ::testing::Test {
@@ -211,8 +200,8 @@ TEST_F(ExecutorTesterRandom, CPU) {
 
   std::unique_ptr<Executor> executor(new Executor(places));
 
-  executor->Run(init_pdesc_, GetGlobalScope());
-  executor->Run(pdesc_, GetGlobalScope());
+  executor->Run(pdesc_, GetGlobalScope(), 0);
+  executor->Run(pdesc_, GetGlobalScope(), 1);
   std::vector<std::vector<float>> result = GetFetchVariable<float>();
 }
 
@@ -231,7 +220,7 @@ TEST_F(ExecutorTesterFeedAndFetch, CPU) {
 
   for (int batch_id = 0; batch_id < 3; batch_id++) {
     SetFeedVariable<float>(inputs_);
-    executor->Run(pdesc_, GetGlobalScope());
+    executor->Run(pdesc_, GetGlobalScope(), 0);
     std::vector<std::vector<float>> result = GetFetchVariable<float>();
     PADDLE_ENFORCE_EQ(result.size(), inputs_.size());
     for (size_t i = 0; i < result.size(); ++i) {
@@ -259,8 +248,8 @@ TEST_F(ExecutorTesterRandom, GPU) {
 
   std::unique_ptr<Executor> executor(new Executor(places));
 
-  executor->Run(init_pdesc_, GetGlobalScope());
-  executor->Run(pdesc_, GetGlobalScope());
+  executor->Run(pdesc_, GetGlobalScope(), 0);
+  executor->Run(pdesc_, GetGlobalScope(), 1);
   std::vector<std::vector<float>> result = GetFetchVariable<float>();
 }
 
@@ -281,7 +270,7 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) {
 
   for (int batch_id = 0; batch_id < 3; batch_id++) {
     SetFeedVariable<float>(inputs_);
-    executor->Run(pdesc_, GetGlobalScope());
+    executor->Run(pdesc_, GetGlobalScope(), 0);
     std::vector<std::vector<float>> result = GetFetchVariable<float>();
     PADDLE_ENFORCE_EQ(result.size(), inputs_.size());
     for (size_t i = 0; i < result.size(); ++i) {

From 932402c16b1ad41851a307e2fcb432e674609071 Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 10 Oct 2017 02:59:49 +0000
Subject: [PATCH 36/61] debug for sum

---
 paddle/framework/backward.cc      |  1 +
 paddle/framework/executor.cc      | 13 +++++-
 paddle/framework/executor_test.cc | 69 +++++++++++++++++++++----------
 paddle/operators/feed_op.cc       |  2 +-
 paddle/operators/fetch_op.cc      |  2 +-
 5 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 0a4688db9c..9a5c4e9cf0 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -378,6 +378,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
         backward_descs[dup_op[i]]->Rename(out_name, new_name);
         sum_op_inputs.emplace_back(new_name);
       }
+      LOG(INFO) << "fuck " << sum_op_inputs.size();
       std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
           "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
       pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index bbc7f77a94..ee6243a9bf 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -74,7 +74,8 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
   std::vector<bool> should_run = Prune(pdesc, block_id);
   PADDLE_ENFORCE_EQ(should_run.size(), block.ops_size());
   for (size_t i = 0; i < should_run.size(); ++i) {
-    if (should_run[i]) {
+    // if (should_run[i]) {
+    if (true) {
       for (auto& var : block.ops(i).outputs()) {
         for (auto& argu : var.arguments()) {
           if (local_scope.FindVar(argu) == nullptr) {
@@ -82,7 +83,17 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
           }
         }
       }
+      LOG(INFO) << block.ops(i).type();
+      if (block.ops(i).type() == "sum") {
+        LOG(INFO) << "Here";
+        for (auto& var : block.ops(i).inputs()) {
+          for (auto& argu : var.arguments()) {
+            LOG(INFO) << var.parameter() << " " << argu;
+          }
+        }
+      }
       auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i));
+      LOG(INFO) << op->DebugString();
       op->Run(local_scope, *device);
     }
   }
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index b64ba1c98f..12be79d01b 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -30,6 +30,7 @@ USE_OP(gaussian_random);
 USE_OP(feed);
 USE_OP(fetch);
 USE_OP(mul);
+USE_OP(sum);
 USE_OP(squared_l2_distance);
 
 using std::string;
@@ -104,40 +105,63 @@ class ExecutorTesterRandom : public ::testing::Test {
   virtual void SetUp() override {
     int input_dim = 5, batch_size = 2, embed_dim = 5;
 
-    auto temp_root_block = pdesc_.add_blocks();
-    temp_root_block->set_idx(0);
-    temp_root_block->set_parent_idx(-1);
-    paddle::framework::ProgramDescBind& program =
-        paddle::framework::ProgramDescBind::Instance(&pdesc_);
-    paddle::framework::BlockDescBind* root_block = program.Block(0);
+    auto temp_init_root_block = init_pdesc_.add_blocks();
+    temp_init_root_block->set_idx(0);
+    temp_init_root_block->set_parent_idx(-1);
+    paddle::framework::ProgramDescBind& init_program =
+        paddle::framework::ProgramDescBind::Instance(&init_pdesc_);
+    paddle::framework::BlockDescBind* init_root_block = init_program.Block(0);
 
-    // block[0]
     AddOp("gaussian_random", {}, {{"Out", {"w1"}}},
-          {{"dims", std::vector<int>{input_dim, embed_dim}}}, root_block);
+          {{"dims", std::vector<int>{input_dim, embed_dim}}}, init_root_block);
     AddOp("gaussian_random", {}, {{"Out", {"w2"}}},
-          {{"dims", std::vector<int>{embed_dim, input_dim}}}, root_block);
+          {{"dims", std::vector<int>{embed_dim, input_dim}}}, init_root_block);
     AddOp("fetch", {{"Input", {"w1"}}}, {},
           {{"dims", std::vector<int>{input_dim, embed_dim}}, {"col", 0}},
-          root_block);
+          init_root_block);
     AddOp("fetch", {{"Input", {"w2"}}}, {},
           {{"dims", std::vector<int>{embed_dim, input_dim}}, {"col", 1}},
-          root_block);
+          init_root_block);
+
+    // flush
+    init_program.Proto();
+
+    auto temp_root_block = pdesc_.add_blocks();
+    temp_root_block->set_idx(0);
+    temp_root_block->set_parent_idx(-1);
+    paddle::framework::ProgramDescBind& program =
+        paddle::framework::ProgramDescBind::Instance(&pdesc_);
+    paddle::framework::BlockDescBind* root_block = program.Block(0);
 
-    // block[1]
-    paddle::framework::BlockDescBind* run_block =
-        program.AppendBlock(*root_block);
     AddOp("gaussian_random", {}, {{"Out", {"a"}}},
-          {{"dims", std::vector<int>{batch_size, input_dim}}}, run_block);
+          {{"dims", std::vector<int>{batch_size, input_dim}}}, root_block);
     AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {},
-          run_block);
+          root_block);
     AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {},
-          run_block);
+          root_block);
     AddOp("squared_l2_distance", {{"X", {"a"}}, {"Y", {"a_out"}}},
           {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {},
-          run_block);
-    AddOp("fetch", {{"Input", {"l2_distance"}}}, {},
-          {{"dims", std::vector<int>{batch_size}}, {"col", 1}}, run_block);
+          root_block);
 
+    AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}},
+          {{"dims", std::vector<int>{batch_size, 1}}}, root_block);
+    AppendBackward(program, {});
+
+    program.Proto();
+
+    for (auto& op : pdesc_.blocks(0).ops()) {
+      if (op.type() == "sum") {
+        LOG(INFO) << "Here";
+        for (auto& var : op.inputs()) {
+          for (auto& argu : var.arguments()) {
+            LOG(INFO) << var.parameter() << " " << argu;
+          }
+        }
+      }
+    }
+
+    AddOp("fetch", {{"Input", {"l2_distance"}}}, {},
+          {{"dims", std::vector<int>{batch_size}}, {"col", 1}}, root_block);
     // flush
     program.Proto();
 
@@ -146,6 +170,7 @@ class ExecutorTesterRandom : public ::testing::Test {
   }
 
  protected:
+  ProgramDesc init_pdesc_;
   ProgramDesc pdesc_;
 };
 
@@ -200,8 +225,8 @@ TEST_F(ExecutorTesterRandom, CPU) {
 
   std::unique_ptr<Executor> executor(new Executor(places));
 
+  executor->Run(init_pdesc_, GetGlobalScope(), 0);
   executor->Run(pdesc_, GetGlobalScope(), 0);
-  executor->Run(pdesc_, GetGlobalScope(), 1);
   std::vector<std::vector<float>> result = GetFetchVariable<float>();
 }
 
@@ -248,8 +273,8 @@ TEST_F(ExecutorTesterRandom, GPU) {
 
   std::unique_ptr<Executor> executor(new Executor(places));
 
+  executor->Run(init_pdesc_, GetGlobalScope(), 0);
   executor->Run(pdesc_, GetGlobalScope(), 0);
-  executor->Run(pdesc_, GetGlobalScope(), 1);
   std::vector<std::vector<float>> result = GetFetchVariable<float>();
 }
 
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index dcd5f7fb77..b15bc86ae1 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -22,7 +22,7 @@ class FeedOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     typedef std::vector<framework::Tensor> FeedInputs;
     PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output should be not null.");
     int col = ctx->Attrs().Get<int>("col");
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index 5adb83144a..7ca3762c36 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -22,7 +22,7 @@ class FetchOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     typedef std::vector<framework::Tensor> FetchOutputs;
     PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should be not null.");
     int col = ctx->Attrs().Get<int>("col");

From 15400748ae6d21facb0b8e656b4298e1ae83df89 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Mon, 9 Oct 2017 20:42:29 -0700
Subject: [PATCH 37/61] follow comments and refine codes

---
 paddle/framework/backward.cc      |  2 +-
 paddle/framework/executor_test.cc | 44 +++++++++++++++----------------
 paddle/operators/feed_op.cc       |  6 ++---
 paddle/operators/feed_op.h        |  4 +--
 paddle/operators/fetch_op.cc      |  4 +--
 paddle/operators/fetch_op.h       |  4 +--
 6 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 9a5c4e9cf0..774d8e4918 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -378,7 +378,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
         backward_descs[dup_op[i]]->Rename(out_name, new_name);
         sum_op_inputs.emplace_back(new_name);
       }
-      LOG(INFO) << "fuck " << sum_op_inputs.size();
+      LOG(INFO) << "sum_op_inputs size " << sum_op_inputs.size();
       std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
           "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
       pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 12be79d01b..0515fb2216 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -60,15 +60,13 @@ void AddOp(const std::string& type, const VariableNameMap& inputs,
   op->SetAttrMap(attrs);
 }
 
-std::once_flag set_variable_flag;
-
 // Tensors in feed value variable will only be in CPUPlace
-// So we can  memcpy the data from vector<T> to feed_value
+// So we can memcpy the data from vector<T> to feed_value
 template <typename T>
 void SetFeedVariable(const std::vector<std::vector<T>>& inputs) {
-  typedef std::vector<paddle::framework::Tensor> FeedInputs;
   Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value");
-  FeedInputs& feed_inputs = *(g_feed_value->GetMutable<FeedInputs>());
+  auto& feed_inputs =
+      *(g_feed_value->GetMutable<std::vector<paddle::framework::Tensor>>());
   size_t size = inputs.size();
   feed_inputs.resize(size);
   for (size_t i = 0; i < size; i++) {
@@ -82,9 +80,9 @@ void SetFeedVariable(const std::vector<std::vector<T>>& inputs) {
 // So we can memcpy the data from fetch_value to vector<T>
 template <typename T>
 std::vector<std::vector<T>> GetFetchVariable() {
-  typedef std::vector<paddle::framework::Tensor> FetchOutputs;
   Variable* g_fetch_value = GetGlobalScope()->FindVar("fetch_value");
-  FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable<FetchOutputs>());
+  auto& fetch_outputs =
+      *(g_fetch_value->GetMutable<std::vector<paddle::framework::Tensor>>());
 
   size_t size = fetch_outputs.size();
   std::vector<std::vector<T>> result;
@@ -143,22 +141,22 @@ class ExecutorTesterRandom : public ::testing::Test {
           {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {},
           root_block);
 
-    AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}},
-          {{"dims", std::vector<int>{batch_size, 1}}}, root_block);
-    AppendBackward(program, {});
-
-    program.Proto();
-
-    for (auto& op : pdesc_.blocks(0).ops()) {
-      if (op.type() == "sum") {
-        LOG(INFO) << "Here";
-        for (auto& var : op.inputs()) {
-          for (auto& argu : var.arguments()) {
-            LOG(INFO) << var.parameter() << " " << argu;
-          }
-        }
-      }
-    }
+    // AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}},
+    //       {{"dims", std::vector<int>{batch_size, 1}}}, root_block);
+    // AppendBackward(program, {});
+
+    // program.Proto();
+
+    // for (auto& op : pdesc_.blocks(0).ops()) {
+    //   if (op.type() == "sum") {
+    //     LOG(INFO) << "Here";
+    //     for (auto& var : op.inputs()) {
+    //       for (auto& argu : var.arguments()) {
+    //         LOG(INFO) << var.parameter() << " " << argu;
+    //       }
+    //     }
+    //   }
+    // }
 
     AddOp("fetch", {{"Input", {"l2_distance"}}}, {},
           {{"dims", std::vector<int>{batch_size}}, {"col", 1}}, root_block);
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index b15bc86ae1..29e128ce7e 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -23,15 +23,15 @@ class FeedOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    typedef std::vector<framework::Tensor> FeedInputs;
     PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output should be not null.");
     int col = ctx->Attrs().Get<int>("col");
     framework::Variable* g_feed_variable =
         framework::GetGlobalScope()->FindVar("feed_value");
 
-    const FeedInputs& tensors = g_feed_variable->Get<FeedInputs>();
+    const auto& tensors =
+        g_feed_variable->Get<std::vector<framework::Tensor>>();
 
-    PADDLE_ENFORCE_GT(tensors.size(), col);
+    PADDLE_ENFORCE_GT(tensors.size(), static_cast<size_t>(col));
     auto in_dim = tensors[col].dims();
     ctx->SetOutputDim("Out", in_dim);
     // TODO(qijun): need to handle LodTensor later
diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h
index de8ec6ff61..96e3bf52bd 100644
--- a/paddle/operators/feed_op.h
+++ b/paddle/operators/feed_op.h
@@ -23,13 +23,13 @@ template <typename T>
 class FeedKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    typedef std::vector<framework::Tensor> FeedInputs;
     framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
     out->mutable_data<T>(ctx.GetPlace());
     framework::Variable* g_feed_variable =
         framework::GetGlobalScope()->FindVar("feed_value");
     int col = ctx.template Attr<int>("col");
-    const FeedInputs& tensors = g_feed_variable->Get<FeedInputs>();
+    const auto& tensors =
+        g_feed_variable->Get<std::vector<framework::Tensor>>();
     out->CopyFrom<T>(tensors[col], ctx.GetPlace());
   }
 };
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index 7ca3762c36..77e3450a73 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -23,13 +23,13 @@ class FetchOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    typedef std::vector<framework::Tensor> FetchOutputs;
     PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should be not null.");
     int col = ctx->Attrs().Get<int>("col");
     framework::Variable* g_fetch_variable =
         framework::GetGlobalScope()->FindVar("fetch_value");
 
-    FetchOutputs* tensors = g_fetch_variable->GetMutable<FetchOutputs>();
+    auto* tensors =
+        g_fetch_variable->GetMutable<std::vector<framework::Tensor>>();
     if (tensors->size() < static_cast<size_t>(col + 1)) {
       tensors->resize(col + 1);
     }
diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h
index 3bec9c9974..fd98552055 100644
--- a/paddle/operators/fetch_op.h
+++ b/paddle/operators/fetch_op.h
@@ -23,12 +23,12 @@ template <typename T>
 class FetchKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    typedef std::vector<framework::Tensor> FetchOutputs;
     const framework::Tensor* input = ctx.Input<framework::Tensor>("Input");
     int col = ctx.template Attr<int>("col");
     framework::Variable* g_fetch_variable =
         framework::GetGlobalScope()->FindVar("fetch_value");
-    FetchOutputs* tensors = g_fetch_variable->GetMutable<FetchOutputs>();
+    auto* tensors =
+        g_fetch_variable->GetMutable<std::vector<framework::Tensor>>();
     (*tensors)[col].mutable_data<T>(platform::CPUPlace());
     (*tensors)[col].CopyFrom<T>(*input, platform::CPUPlace());
   }

From e3161bb61a4686d96588bc1eb86c3edc0e26e6ee Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 10 Oct 2017 04:49:45 +0000
Subject: [PATCH 38/61] pass simple backward

---
 paddle/framework/executor_test.cc | 51 ++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 0515fb2216..9f8a6f8593 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -32,6 +32,8 @@ USE_OP(fetch);
 USE_OP(mul);
 USE_OP(sum);
 USE_OP(squared_l2_distance);
+USE_OP(fill_constant);
+USE_OP(sgd);
 
 using std::string;
 using namespace paddle::platform;
@@ -124,6 +126,7 @@ class ExecutorTesterRandom : public ::testing::Test {
     // flush
     init_program.Proto();
 
+    // run block
     auto temp_root_block = pdesc_.add_blocks();
     temp_root_block->set_idx(0);
     temp_root_block->set_parent_idx(-1);
@@ -131,6 +134,7 @@ class ExecutorTesterRandom : public ::testing::Test {
         paddle::framework::ProgramDescBind::Instance(&pdesc_);
     paddle::framework::BlockDescBind* root_block = program.Block(0);
 
+    // forward
     AddOp("gaussian_random", {}, {{"Out", {"a"}}},
           {{"dims", std::vector<int>{batch_size, input_dim}}}, root_block);
     AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {},
@@ -141,30 +145,33 @@ class ExecutorTesterRandom : public ::testing::Test {
           {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {},
           root_block);
 
-    // AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}},
-    //       {{"dims", std::vector<int>{batch_size, 1}}}, root_block);
-    // AppendBackward(program, {});
-
-    // program.Proto();
-
-    // for (auto& op : pdesc_.blocks(0).ops()) {
-    //   if (op.type() == "sum") {
-    //     LOG(INFO) << "Here";
-    //     for (auto& var : op.inputs()) {
-    //       for (auto& argu : var.arguments()) {
-    //         LOG(INFO) << var.parameter() << " " << argu;
-    //       }
-    //     }
-    //   }
-    // }
-
-    AddOp("fetch", {{"Input", {"l2_distance"}}}, {},
-          {{"dims", std::vector<int>{batch_size}}, {"col", 1}}, root_block);
+    // backward
+    AddOp("fill_constant", {}, {{"Out", {"l2_distance@GRAD"}}},
+          {{"shape", std::vector<int>{batch_size, 1}}, {"value", float(1.0)}},
+          root_block);
+    AppendBackward(program, {});
+
+    // update
+    AddOp("fill_constant", {}, {{"Out", {"learning_rate"}}},
+          {{"shape", std::vector<int>{1}}, {"value", float(1.0)}}, root_block);
+    AddOp("sgd", {{"Param", {"w1"}},
+                  {"LearningRate", {"learning_rate"}},
+                  {"Grad", {"w1@GRAD"}}},
+          {{"ParamOut", {"w1"}}}, {}, root_block);
+    AddOp("sgd", {{"Param", {"w2"}},
+                  {"LearningRate", {"learning_rate"}},
+                  {"Grad", {"w2@GRAD"}}},
+          {{"ParamOut", {"w2"}}}, {}, root_block);
+
+    AddOp("fetch", {{"Input", {"w1"}}}, {},
+          {{"dims", std::vector<int>{input_dim, embed_dim}}, {"col", 0}},
+          root_block);
+    AddOp("fetch", {{"Input", {"w2"}}}, {},
+          {{"dims", std::vector<int>{embed_dim, input_dim}}, {"col", 1}},
+          root_block);
+
     // flush
     program.Proto();
-
-    // TODO(tonyyang-svail):
-    //   - Test with Backward
   }
 
  protected:

From 2fc7fc7a18fb8cbb78d380caf51947097138597c Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 10 Oct 2017 05:33:11 +0000
Subject: [PATCH 39/61] pass multiple forward backward

---
 paddle/framework/executor_test.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 9f8a6f8593..259205f7c1 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -279,8 +279,10 @@ TEST_F(ExecutorTesterRandom, GPU) {
   std::unique_ptr<Executor> executor(new Executor(places));
 
   executor->Run(init_pdesc_, GetGlobalScope(), 0);
-  executor->Run(pdesc_, GetGlobalScope(), 0);
-  std::vector<std::vector<float>> result = GetFetchVariable<float>();
+  for (int batch_id = 0; batch_id < 3; batch_id++) {
+    executor->Run(pdesc_, GetGlobalScope(), 0);
+    std::vector<std::vector<float>> result = GetFetchVariable<float>();
+  }
 }
 
 TEST_F(ExecutorTesterFeedAndFetch, GPU) {

From 975a51294e20c122e7143a232261d4fd49ac5643 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Mon, 9 Oct 2017 23:55:35 -0700
Subject: [PATCH 40/61] infer feed operator output variable shape with dims
 attribute

---
 paddle/operators/feed_op.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index 29e128ce7e..1d65c2bb46 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -32,8 +32,12 @@ class FeedOp : public framework::OperatorWithKernel {
         g_feed_variable->Get<std::vector<framework::Tensor>>();
 
     PADDLE_ENFORCE_GT(tensors.size(), static_cast<size_t>(col));
-    auto in_dim = tensors[col].dims();
-    ctx->SetOutputDim("Out", in_dim);
+
+    auto& shape = ctx->Attrs().Get<std::vector<int>>("dims");
+    std::vector<int64_t> shape_int64(shape.size(), 0);
+    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
+                   [](int a) { return static_cast<int64_t>(a); });
+    ctx->SetOutputDim("Out", framework::make_ddim(shape_int64));
     // TODO(qijun): need to handle LodTensor later
   }
 

From 67edd04a2f37c6bee5642d1d75be5ca5eb250b4b Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Tue, 10 Oct 2017 21:29:18 +0800
Subject: [PATCH 41/61] fix doc

---
 paddle/operators/pool_op.cc            | 75 ++++++++++++++++----------
 paddle/operators/pool_with_index_op.cc |  7 +--
 2 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
index ba3b5ed207..acc7e66c08 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -40,8 +40,6 @@ class PoolOp : public framework::OperatorWithKernel {
     std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
     std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
 
-    PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg",
-                   "pooling_type should be 'max' or 'avg'");
     PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                    "Pooling intput should be 4-D or 5-D");
 
@@ -52,13 +50,11 @@ class PoolOp : public framework::OperatorWithKernel {
     }
 
     PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
-                   "Input size and Pooling size should be consistent.");
-    PADDLE_ENFORCE(ksize.size() == 2 || ksize.size() == 3,
-                   "Pooling size should be 2 elements. or 3 elements.");
+                   "Input size and pooling size should be consistent.");
     PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
-                      "strides size and pooling size should be the same.");
+                      "Strides size and pooling size should be the same.");
     PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
-                      "paddings size and pooling size should be the same.");
+                      "Paddings size and pooling size should be the same.");
 
     std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
     for (size_t i = 0; i < ksize.size(); ++i) {
@@ -75,10 +71,9 @@ class PoolOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "X(Input) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Input@Grad of Pooling should not be null.");
+                   "Input(X@GRAD) should not be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 };
@@ -94,17 +89,22 @@ class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
         "number of channels, H and W is the height and width of feature.");
     AddOutput("Out",
               "The output tensor of pooling operator."
-              "The format of output tensor is also NCHW.");
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
 
     AddAttr<std::string>("poolingType",
                          "PoolingType of pooling operator."
                          "Str constant equal to 'max' or 'avg'.")
         .InEnum({"max", "avg"});
+
     AddAttr<std::vector<int>>(
         "ksize",
-        "Pooling size(depth, height, width) of pooling operator."
+        "The pooling size(height, width) of pooling operator."
         "If globalPooling = true, ksize is ignored and need not be "
-        "specified.");  // TODO(Add checker)
+        "specified.");  // TODO(Chengduo): Add checker. (Currently,
+                        // TypedAttrChecker don't support vector type.)
     AddAttr<bool>(
         "globalPooling",
         "Whether to use the globalPooling."
@@ -114,15 +114,22 @@ class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(false);
     AddAttr<std::vector<int>>("strides",
                               "Strides(height, width) of pooling operator."
-                              "Default {1,1}")
-        .SetDefault({1, 1});  // TODO(Add checker)
+                              "Default {1,1}.")
+        .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
+                              // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>("paddings",
                               "Paddings(height, width) of pooling operator."
                               "Default {0,0}.")
-        .SetDefault({0, 0});  // TODO(Add checker)
+        .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
+                              // TypedAttrChecker don't support vector type.)
+
     AddComment(R"DOC(
 The pooling2d operation calculates the output based on
 the input, poolingType and ksize, strides, paddings parameters.
+Input(X) and output(Out) are in NCHW format. Where N is batch size, C is the
+number of channels, H and W is the height and width of feature.
+Parameters(ksize, strides, paddings) are two elements.
+These two elements represent height and width, respectively.
 )DOC");
   }
 };
@@ -131,25 +138,30 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   Pool3dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
-             "The input tensor of pooling operator. "
-             "The format of input tensor is NCDHW. Where N is batch size, C is "
-             "the "
-             "number of channels, D, H and W is the depth, height and width of "
-             "feature.");
+    AddInput(
+        "X",
+        "The input tensor of pooling operator. "
+        "The format of input tensor is NCDHW. Where N is batch size, C is "
+        "the number of channels, D, H and W is the depth, height and width of "
+        "feature.");
     AddOutput("Out",
               "The output tensor of pooling operator."
-              "The format of output tensor is also NCDHW.");
+              "The format of output tensor is also NCDHW."
+              "Where N is batch size, C is "
+              "the number of channels, D, H and W is the depth, height and "
+              "width of feature.");
 
     AddAttr<std::string>("poolingType",
                          "PoolingType of pooling operator."
-                         "str constant equal to 'max' or 'avg'.")
+                         "Str constant equal to 'max' or 'avg'.")
         .InEnum({"max", "avg"});
+
     AddAttr<std::vector<int>>(
         "ksize",
-        "Pooling size(depth, height, width) of pooling operator."
+        "The pooling size(depth, height, width) of pooling operator."
         "If globalPooling = true, ksize is ignored and need not be "
-        "specified.");  // TODO(Add checker)
+        "specified.");  // TODO(Chengduo): Add checker. (Currently,
+                        // TypedAttrChecker don't support vector type.)
     AddAttr<bool>(
         "globalPooling",
         "Whether to use the globalPooling."
@@ -161,15 +173,22 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
         "strides",
         "Strides(depth, height, width) of pooling operator."
         "Default {1,1,1}.")
-        .SetDefault({1, 1, 1});  // TODO(Add checker)
+        .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
+                                 // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>(
         "paddings",
         "Paddings(depth, height, width) of pooling operator."
         "Default {0,0,0}.")
-        .SetDefault({0, 0, 0});  // TODO(Add checker)
+        .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
+                                 // TypedAttrChecker don't support vector type.)
+
     AddComment(R"DOC(
 The pooling3d operation calculates the output based on
 the input, poolingType and ksize, strides, paddings parameters.
+Input(X) and output(Out) are in NCDHW format. Where N is batch
+size, C is the number of channels, D, H and W is the depth, height and
+width of feature. Parameters(ksize, strides, paddings) are three elements.
+These three elements represent depth, height and width, respectively.
 )DOC");
   }
 };
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
index ab933a3400..b49d486d7c 100644
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -28,7 +28,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "X(Input) of Pooling should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -52,7 +52,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
     }
 
     PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
-                   "Intput size and pooling size should be consistent.");
+                   "Input size and pooling size should be consistent.");
     PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
                       "Strides size and pooling size should be the same.");
     PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
@@ -73,7 +73,8 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Mask"), "Input(Mask) must not be null.");
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                    "Input(X@GRAD) should not be null.");

From 6db476ed89b64a91e07ed7e13344645d27c9f1fb Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Tue, 10 Oct 2017 21:35:39 +0800
Subject: [PATCH 42/61] Separate the declarations and implementation of the
 PoolOp and PoolMaker class in order to reuse in pool_cudnn

---
 paddle/operators/pool_op.cc | 290 +++++++++++++++++-------------------
 paddle/operators/pool_op.h  |  28 ++++
 2 files changed, 164 insertions(+), 154 deletions(-)

diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
index acc7e66c08..25fd01844b 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -22,108 +22,94 @@ int OutputSizePool(int input_size, int filter_size, int padding, int stride) {
   return output_size;
 }
 
-class PoolOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "X(Input) of Pooling should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Out(Output) of Pooling should not be null.");
-
-    auto in_x_dims = ctx->GetInputDim("X");
-
-    std::string pooling_type = ctx->Attrs().Get<std::string>("poolingType");
-    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
-    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-
-    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
-                   "Pooling intput should be 4-D or 5-D");
-
-    if (ctx->Attrs().Get<bool>("globalPooling")) {
-      ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
-      for (size_t i = 0; i < ksize.size(); ++i)
-        ksize[i] = static_cast<int>(in_x_dims[i + 2]);
-    }
-
-    PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
-                   "Input size and pooling size should be consistent.");
-    PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
-                      "Strides size and pooling size should be the same.");
-    PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
-                      "Paddings size and pooling size should be the same.");
-
-    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
-    for (size_t i = 0; i < ksize.size(); ++i) {
-      output_shape.push_back(
-          OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
-    }
-    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                 "Out(Output) of Pooling should not be null.");
+
+  auto in_x_dims = ctx->GetInputDim("X");
+
+  std::string pooling_type = ctx->Attrs().Get<std::string>("poolingType");
+  std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+  PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
+                 "Pooling intput should be 4-D or 5-D");
+
+  if (ctx->Attrs().Get<bool>("globalPooling")) {
+    ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
+    for (size_t i = 0; i < ksize.size(); ++i)
+      ksize[i] = static_cast<int>(in_x_dims[i + 2]);
   }
-};
-
-class PoolOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Input(X@GRAD) should not be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+
+  PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
+                 "Input size and pooling size should be consistent.");
+  PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
+                    "Strides size and pooling size should be the same.");
+  PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
+                    "Paddings size and pooling size should be the same.");
+
+  std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+  for (size_t i = 0; i < ksize.size(); ++i) {
+    output_shape.push_back(
+        OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
   }
-};
-
-class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  Pool2dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "X",
-        "The input tensor of pooling operator. "
-        "The format of input tensor is NCHW. Where N is batch size, C is the "
-        "number of channels, H and W is the height and width of feature.");
-    AddOutput("Out",
-              "The output tensor of pooling operator."
-              "The format of output tensor is also NCHW."
-              "Where N is batch size, C is "
-              "the number of channels, H and W is the height and "
-              "width of feature.");
-
-    AddAttr<std::string>("poolingType",
-                         "PoolingType of pooling operator."
-                         "Str constant equal to 'max' or 'avg'.")
-        .InEnum({"max", "avg"});
-
-    AddAttr<std::vector<int>>(
-        "ksize",
-        "The pooling size(height, width) of pooling operator."
-        "If globalPooling = true, ksize is ignored and need not be "
-        "specified.");  // TODO(Chengduo): Add checker. (Currently,
-                        // TypedAttrChecker don't support vector type.)
-    AddAttr<bool>(
-        "globalPooling",
-        "Whether to use the globalPooling."
-        "Bool constant equal to false or true."
-        "Default false."
-        "If globalPooling = true, ksize is ignored and need not be specified.")
-        .SetDefault(false);
-    AddAttr<std::vector<int>>("strides",
-                              "Strides(height, width) of pooling operator."
-                              "Default {1,1}.")
-        .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
-                              // TypedAttrChecker don't support vector type.)
-    AddAttr<std::vector<int>>("paddings",
-                              "Paddings(height, width) of pooling operator."
-                              "Default {0,0}.")
-        .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
-                              // TypedAttrChecker don't support vector type.)
-
-    AddComment(R"DOC(
+  ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+}
+
+void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                 "Input(X@GRAD) should not be null.");
+  ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+}
+
+Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "X",
+      "The input tensor of pooling operator. "
+      "The format of input tensor is NCHW. Where N is batch size, C is the "
+      "number of channels, H and W is the height and width of feature.");
+  AddOutput("Out",
+            "The output tensor of pooling operator."
+            "The format of output tensor is also NCHW."
+            "Where N is batch size, C is "
+            "the number of channels, H and W is the height and "
+            "width of feature.");
+
+  AddAttr<std::string>("poolingType",
+                       "PoolingType of pooling operator."
+                       "Str constant equal to 'max' or 'avg'.")
+      .InEnum({"max", "avg"});
+
+  AddAttr<std::vector<int>>(
+      "ksize",
+      "The pooling size(height, width) of pooling operator."
+      "If globalPooling = true, ksize is ignored and need not be "
+      "specified.");  // TODO(Chengduo): Add checker. (Currently,
+                      // TypedAttrChecker don't support vector type.)
+  AddAttr<bool>(
+      "globalPooling",
+      "Whether to use the globalPooling."
+      "Bool constant equal to false or true."
+      "Default false."
+      "If globalPooling = true, ksize is ignored and need not be specified.")
+      .SetDefault(false);
+  AddAttr<std::vector<int>>("strides",
+                            "Strides(height, width) of pooling operator."
+                            "Default {1,1}.")
+      .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
+                            // TypedAttrChecker don't support vector type.)
+  AddAttr<std::vector<int>>("paddings",
+                            "Paddings(height, width) of pooling operator."
+                            "Default {0,0}.")
+      .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
+                            // TypedAttrChecker don't support vector type.)
+
+  AddComment(R"DOC(
 The pooling2d operation calculates the output based on
 the input, poolingType and ksize, strides, paddings parameters.
 Input(X) and output(Out) are in NCHW format. Where N is batch size, C is the
@@ -131,58 +117,55 @@ number of channels, H and W is the height and width of feature.
 Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
 )DOC");
-  }
-};
-
-class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  Pool3dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "X",
-        "The input tensor of pooling operator. "
-        "The format of input tensor is NCDHW. Where N is batch size, C is "
-        "the number of channels, D, H and W is the depth, height and width of "
-        "feature.");
-    AddOutput("Out",
-              "The output tensor of pooling operator."
-              "The format of output tensor is also NCDHW."
-              "Where N is batch size, C is "
-              "the number of channels, D, H and W is the depth, height and "
-              "width of feature.");
-
-    AddAttr<std::string>("poolingType",
-                         "PoolingType of pooling operator."
-                         "Str constant equal to 'max' or 'avg'.")
-        .InEnum({"max", "avg"});
-
-    AddAttr<std::vector<int>>(
-        "ksize",
-        "The pooling size(depth, height, width) of pooling operator."
-        "If globalPooling = true, ksize is ignored and need not be "
-        "specified.");  // TODO(Chengduo): Add checker. (Currently,
-                        // TypedAttrChecker don't support vector type.)
-    AddAttr<bool>(
-        "globalPooling",
-        "Whether to use the globalPooling."
-        "Bool constant equal to false or true."
-        "Default false."
-        "If globalPooling = true, ksize is ignored and need not be specified.")
-        .SetDefault(false);
-    AddAttr<std::vector<int>>(
-        "strides",
-        "Strides(depth, height, width) of pooling operator."
-        "Default {1,1,1}.")
-        .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
-                                 // TypedAttrChecker don't support vector type.)
-    AddAttr<std::vector<int>>(
-        "paddings",
-        "Paddings(depth, height, width) of pooling operator."
-        "Default {0,0,0}.")
-        .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
-                                 // TypedAttrChecker don't support vector type.)
-
-    AddComment(R"DOC(
+}
+
+Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "X",
+      "The input tensor of pooling operator. "
+      "The format of input tensor is NCDHW. Where N is batch size, C is "
+      "the number of channels, D, H and W is the depth, height and width of "
+      "feature.");
+  AddOutput("Out",
+            "The output tensor of pooling operator."
+            "The format of output tensor is also NCDHW."
+            "Where N is batch size, C is "
+            "the number of channels, D, H and W is the depth, height and "
+            "width of feature.");
+
+  AddAttr<std::string>("poolingType",
+                       "PoolingType of pooling operator."
+                       "Str constant equal to 'max' or 'avg'.")
+      .InEnum({"max", "avg"});
+
+  AddAttr<std::vector<int>>(
+      "ksize",
+      "The pooling size(depth, height, width) of pooling operator."
+      "If globalPooling = true, ksize is ignored and need not be "
+      "specified.");  // TODO(Chengduo): Add checker. (Currently,
+                      // TypedAttrChecker don't support vector type.)
+  AddAttr<bool>(
+      "globalPooling",
+      "Whether to use the globalPooling."
+      "Bool constant equal to false or true."
+      "Default false."
+      "If globalPooling = true, ksize is ignored and need not be specified.")
+      .SetDefault(false);
+  AddAttr<std::vector<int>>("strides",
+                            "Strides(depth, height, width) of pooling operator."
+                            "Default {1,1,1}.")
+      .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
+                               // TypedAttrChecker don't support vector type.)
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "Paddings(depth, height, width) of pooling operator."
+      "Default {0,0,0}.")
+      .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
+                               // TypedAttrChecker don't support vector type.)
+
+  AddComment(R"DOC(
 The pooling3d operation calculates the output based on
 the input, poolingType and ksize, strides, paddings parameters.
 Input(X) and output(Out) are in NCDHW format. Where N is batch
@@ -190,8 +173,7 @@ size, C is the number of channels, D, H and W is the depth, height and
 width of feature. Parameters(ksize, strides, paddings) are three elements.
 These three elements represent depth, height and width, respectively.
 )DOC");
-  }
-};
+}
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h
index c2bc358def..e5016d573d 100644
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -24,6 +24,34 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
+class PoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+class PoolOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool2dOpMaker(framework::OpProto* proto,
+                framework::OpAttrChecker* op_checker);
+};
+
+class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool3dOpMaker(framework::OpProto* proto,
+                framework::OpAttrChecker* op_checker);
+};
+
 template <typename Place, typename T>
 class PoolKernel : public framework::OpKernel<T> {
  public:

From a308ff29af714be50e321c65fdcd88729a505ebe Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Tue, 10 Oct 2017 10:25:01 -0700
Subject: [PATCH 43/61] make infershape of feedop and fetchop compatible with
 compile-time design

---
 paddle/framework/executor_test.cc | 22 ++++++----------------
 paddle/operators/feed_op.cc       | 13 ++-----------
 paddle/operators/feed_op.h        |  3 ++-
 paddle/operators/fetch_op.cc      | 20 ++------------------
 paddle/operators/fetch_op.h       |  8 +++++++-
 5 files changed, 19 insertions(+), 47 deletions(-)

diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 259205f7c1..0710eb5779 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -116,12 +116,8 @@ class ExecutorTesterRandom : public ::testing::Test {
           {{"dims", std::vector<int>{input_dim, embed_dim}}}, init_root_block);
     AddOp("gaussian_random", {}, {{"Out", {"w2"}}},
           {{"dims", std::vector<int>{embed_dim, input_dim}}}, init_root_block);
-    AddOp("fetch", {{"Input", {"w1"}}}, {},
-          {{"dims", std::vector<int>{input_dim, embed_dim}}, {"col", 0}},
-          init_root_block);
-    AddOp("fetch", {{"Input", {"w2"}}}, {},
-          {{"dims", std::vector<int>{embed_dim, input_dim}}, {"col", 1}},
-          init_root_block);
+    AddOp("fetch", {{"Input", {"w1"}}}, {}, {{"col", 0}}, init_root_block);
+    AddOp("fetch", {{"Input", {"w2"}}}, {}, {{"col", 1}}, init_root_block);
 
     // flush
     init_program.Proto();
@@ -163,12 +159,8 @@ class ExecutorTesterRandom : public ::testing::Test {
                   {"Grad", {"w2@GRAD"}}},
           {{"ParamOut", {"w2"}}}, {}, root_block);
 
-    AddOp("fetch", {{"Input", {"w1"}}}, {},
-          {{"dims", std::vector<int>{input_dim, embed_dim}}, {"col", 0}},
-          root_block);
-    AddOp("fetch", {{"Input", {"w2"}}}, {},
-          {{"dims", std::vector<int>{embed_dim, input_dim}}, {"col", 1}},
-          root_block);
+    AddOp("fetch", {{"Input", {"w1"}}}, {}, {{"col", 0}}, root_block);
+    AddOp("fetch", {{"Input", {"w2"}}}, {}, {{"col", 1}}, root_block);
 
     // flush
     program.Proto();
@@ -197,10 +189,8 @@ class ExecutorTesterFeedAndFetch : public ::testing::Test {
           root_block);
     AddOp("feed", {}, {{"Out", {"b"}}}, {{"dims", dim}, {"col", 1}},
           root_block);
-    AddOp("fetch", {{"Input", {"a"}}}, {}, {{"dims", dim}, {"col", 0}},
-          root_block);
-    AddOp("fetch", {{"Input", {"b"}}}, {}, {{"dims", dim}, {"col", 1}},
-          root_block);
+    AddOp("fetch", {{"Input", {"a"}}}, {}, {{"col", 0}}, root_block);
+    AddOp("fetch", {{"Input", {"b"}}}, {}, {{"col", 1}}, root_block);
 
     // flush
     program.Proto();
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index 1d65c2bb46..fa325bb282 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -24,15 +24,6 @@ class FeedOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output should be not null.");
-    int col = ctx->Attrs().Get<int>("col");
-    framework::Variable* g_feed_variable =
-        framework::GetGlobalScope()->FindVar("feed_value");
-
-    const auto& tensors =
-        g_feed_variable->Get<std::vector<framework::Tensor>>();
-
-    PADDLE_ENFORCE_GT(tensors.size(), static_cast<size_t>(col));
-
     auto& shape = ctx->Attrs().Get<std::vector<int>>("dims");
     std::vector<int64_t> shape_int64(shape.size(), 0);
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
@@ -43,7 +34,7 @@ class FeedOp : public framework::OperatorWithKernel {
 
   framework::DataType IndicateDataType(
       const framework::ExecutionContext& ctx) const override {
-    return static_cast<framework::DataType>(Attr<int>("data_type"));
+    return static_cast<framework::DataType>(Attr<int>("dataType"));
   }
 };
 
@@ -51,7 +42,7 @@ class FeedOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   FeedOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddAttr<int>("data_type", "output data type")
+    AddAttr<int>("dataType", "output data type")
         .SetDefault(framework::DataType::FP32);
     AddAttr<int>("col", "The col in global feed variable").SetDefault(0);
     AddAttr<std::vector<int>>("dims", "The dimension of feed tensor.");
diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h
index 96e3bf52bd..47344e309c 100644
--- a/paddle/operators/feed_op.h
+++ b/paddle/operators/feed_op.h
@@ -27,9 +27,10 @@ class FeedKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(ctx.GetPlace());
     framework::Variable* g_feed_variable =
         framework::GetGlobalScope()->FindVar("feed_value");
-    int col = ctx.template Attr<int>("col");
     const auto& tensors =
         g_feed_variable->Get<std::vector<framework::Tensor>>();
+    int col = ctx.template Attr<int>("col");
+    PADDLE_ENFORCE_GT(tensors.size(), static_cast<size_t>(col));
     out->CopyFrom<T>(tensors[col], ctx.GetPlace());
   }
 };
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index 77e3450a73..90737c8c55 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -24,26 +24,11 @@ class FetchOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should be not null.");
-    int col = ctx->Attrs().Get<int>("col");
-    framework::Variable* g_fetch_variable =
-        framework::GetGlobalScope()->FindVar("fetch_value");
-
-    auto* tensors =
-        g_fetch_variable->GetMutable<std::vector<framework::Tensor>>();
-    if (tensors->size() < static_cast<size_t>(col + 1)) {
-      tensors->resize(col + 1);
-    }
-
-    auto input_dim = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_GT(tensors->size(), col);
-    (*tensors)[col].Resize(input_dim);
-
-    // TODO(qijun): need to handle LodTensor later
   }
 
   framework::DataType IndicateDataType(
       const framework::ExecutionContext& ctx) const override {
-    return static_cast<framework::DataType>(Attr<int>("data_type"));
+    return static_cast<framework::DataType>(Attr<int>("dataType"));
   }
 };
 
@@ -51,10 +36,9 @@ class FetchOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   FetchOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddAttr<int>("data_type", "output data type")
+    AddAttr<int>("dataType", "output data type")
         .SetDefault(framework::DataType::FP32);
     AddAttr<int>("col", "The col in global fetch variable").SetDefault(0);
-    AddAttr<std::vector<int>>("dims", "The dimension of fetch tensor.");
     AddInput("Input", "The output of fetch op.");
     AddComment(R"DOC(Fetch data to global fetch variable)DOC");
   }
diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h
index fd98552055..6fee8b0589 100644
--- a/paddle/operators/fetch_op.h
+++ b/paddle/operators/fetch_op.h
@@ -24,13 +24,19 @@ class FetchKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const framework::Tensor* input = ctx.Input<framework::Tensor>("Input");
-    int col = ctx.template Attr<int>("col");
     framework::Variable* g_fetch_variable =
         framework::GetGlobalScope()->FindVar("fetch_value");
     auto* tensors =
         g_fetch_variable->GetMutable<std::vector<framework::Tensor>>();
+    int col = ctx.template Attr<int>("col");
+    if (tensors->size() < static_cast<size_t>(col + 1)) {
+      tensors->resize(col + 1);
+    }
+    PADDLE_ENFORCE_GT(tensors->size(), static_cast<size_t>(col));
+    (*tensors)[col].Resize(input->dims());
     (*tensors)[col].mutable_data<T>(platform::CPUPlace());
     (*tensors)[col].CopyFrom<T>(*input, platform::CPUPlace());
+    // TODO(qijun): need to handle LodTensor later
   }
 };
 

From 3f9e247a7358ae7824c3ce63a7231b54b31944a3 Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 10 Oct 2017 18:53:54 +0000
Subject: [PATCH 44/61] set variable support dim

---
 paddle/framework/executor.cc      |  3 +--
 paddle/framework/executor_test.cc | 30 ++++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index ee6243a9bf..f4cc37cfa6 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -74,8 +74,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
   std::vector<bool> should_run = Prune(pdesc, block_id);
   PADDLE_ENFORCE_EQ(should_run.size(), block.ops_size());
   for (size_t i = 0; i < should_run.size(); ++i) {
-    // if (should_run[i]) {
-    if (true) {
+    if (should_run[i]) {
       for (auto& var : block.ops(i).outputs()) {
         for (auto& argu : var.arguments()) {
           if (local_scope.FindVar(argu) == nullptr) {
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 0710eb5779..ce8b599e0e 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -65,15 +65,15 @@ void AddOp(const std::string& type, const VariableNameMap& inputs,
 // Tensors in feed value variable will only be in CPUPlace
 // So we can memcpy the data from vector<T> to feed_value
 template <typename T>
-void SetFeedVariable(const std::vector<std::vector<T>>& inputs) {
+void SetFeedVariable(const std::vector<std::vector<T>>& inputs,
+                     const std::vector<std::vector<int64_t>>& dims) {
   Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value");
   auto& feed_inputs =
       *(g_feed_value->GetMutable<std::vector<paddle::framework::Tensor>>());
   size_t size = inputs.size();
   feed_inputs.resize(size);
   for (size_t i = 0; i < size; i++) {
-    T* dst = feed_inputs[i].mutable_data<T>(
-        make_ddim({static_cast<int64_t>(inputs[i].size())}), CPUPlace());
+    T* dst = feed_inputs[i].mutable_data<T>(make_ddim(dims[i]), CPUPlace());
     memcpy(dst, inputs[i].data(), inputs[i].size() * sizeof(T));
   }
 }
@@ -103,7 +103,7 @@ std::vector<std::vector<T>> GetFetchVariable() {
 class ExecutorTesterRandom : public ::testing::Test {
  public:
   virtual void SetUp() override {
-    int input_dim = 5, batch_size = 2, embed_dim = 5;
+    int input_dim = 3, batch_size = 2, embed_dim = 5;
 
     auto temp_init_root_block = init_pdesc_.add_blocks();
     temp_init_root_block->set_idx(0);
@@ -130,9 +130,16 @@ class ExecutorTesterRandom : public ::testing::Test {
         paddle::framework::ProgramDescBind::Instance(&pdesc_);
     paddle::framework::BlockDescBind* root_block = program.Block(0);
 
+    // feed data
+    inputs_.push_back({1.0, 2.0, 3.0, 4.0, 5.0, 6.0});
+    dims_.push_back({batch_size, input_dim});
+    AddOp("feed", {}, {{"Out", {"a"}}},
+          {{"dims", std::vector<int>{batch_size, input_dim}}, {"col", 0}},
+          root_block);
+
     // forward
-    AddOp("gaussian_random", {}, {{"Out", {"a"}}},
-          {{"dims", std::vector<int>{batch_size, input_dim}}}, root_block);
+    // AddOp("gaussian_random", {}, {{"Out", {"a"}}},
+    //       {{"dims", std::vector<int>{batch_size, input_dim}}}, root_block);
     AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {},
           root_block);
     AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {},
@@ -161,6 +168,7 @@ class ExecutorTesterRandom : public ::testing::Test {
 
     AddOp("fetch", {{"Input", {"w1"}}}, {}, {{"col", 0}}, root_block);
     AddOp("fetch", {{"Input", {"w2"}}}, {}, {{"col", 1}}, root_block);
+    AddOp("fetch", {{"Input", {"l2_distance"}}}, {}, {{"col", 0}}, root_block);
 
     // flush
     program.Proto();
@@ -169,6 +177,8 @@ class ExecutorTesterRandom : public ::testing::Test {
  protected:
   ProgramDesc init_pdesc_;
   ProgramDesc pdesc_;
+  std::vector<std::vector<float>> inputs_;
+  std::vector<std::vector<int64_t>> dims_;
 };
 
 class ExecutorTesterFeedAndFetch : public ::testing::Test {
@@ -199,11 +209,14 @@ class ExecutorTesterFeedAndFetch : public ::testing::Test {
     std::vector<float> vec2 = {4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
     inputs_.push_back(vec1);
     inputs_.push_back(vec2);
+    dims_.push_back({static_cast<int64_t>(vec1.size())});
+    dims_.push_back({static_cast<int64_t>(vec2.size())});
   }
 
  protected:
   ProgramDesc pdesc_;
   std::vector<std::vector<float>> inputs_;
+  std::vector<std::vector<int64_t>> dims_;
 };
 
 #ifndef PADDLE_WITH_CUDA
@@ -239,7 +252,7 @@ TEST_F(ExecutorTesterFeedAndFetch, CPU) {
   std::unique_ptr<Executor> executor(new Executor(places));
 
   for (int batch_id = 0; batch_id < 3; batch_id++) {
-    SetFeedVariable<float>(inputs_);
+    SetFeedVariable<float>(inputs_, dims_);
     executor->Run(pdesc_, GetGlobalScope(), 0);
     std::vector<std::vector<float>> result = GetFetchVariable<float>();
     PADDLE_ENFORCE_EQ(result.size(), inputs_.size());
@@ -270,6 +283,7 @@ TEST_F(ExecutorTesterRandom, GPU) {
 
   executor->Run(init_pdesc_, GetGlobalScope(), 0);
   for (int batch_id = 0; batch_id < 3; batch_id++) {
+    SetFeedVariable<float>(inputs_, dims_);
     executor->Run(pdesc_, GetGlobalScope(), 0);
     std::vector<std::vector<float>> result = GetFetchVariable<float>();
   }
@@ -291,7 +305,7 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) {
   std::unique_ptr<Executor> executor(new Executor(places));
 
   for (int batch_id = 0; batch_id < 3; batch_id++) {
-    SetFeedVariable<float>(inputs_);
+    SetFeedVariable<float>(inputs_, dims_);
     executor->Run(pdesc_, GetGlobalScope(), 0);
     std::vector<std::vector<float>> result = GetFetchVariable<float>();
     PADDLE_ENFORCE_EQ(result.size(), inputs_.size());

From 293a7d1e75d14a744852523383bdbef1663887be Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 10 Oct 2017 18:55:16 +0000
Subject: [PATCH 45/61] add feed infershape todo

---
 paddle/operators/feed_op.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h
index 47344e309c..e406d22209 100644
--- a/paddle/operators/feed_op.h
+++ b/paddle/operators/feed_op.h
@@ -31,6 +31,9 @@ class FeedKernel : public framework::OpKernel<T> {
         g_feed_variable->Get<std::vector<framework::Tensor>>();
     int col = ctx.template Attr<int>("col");
     PADDLE_ENFORCE_GT(tensors.size(), static_cast<size_t>(col));
+    // TODO(qijun):
+    //   check tensors[col].dims() with attribute,
+    //   except the first dimension.
     out->CopyFrom<T>(tensors[col], ctx.GetPlace());
   }
 };

From 062ff4d77b61fc72b0654064911b193714cfb18f Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 10 Oct 2017 19:07:21 +0000
Subject: [PATCH 46/61] clean up

---
 paddle/framework/executor.cc      | 14 +-------------
 paddle/framework/executor_test.cc |  8 +++-----
 2 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index f4cc37cfa6..def1d1fd06 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -72,7 +72,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
   Scope& local_scope = scope->NewScope();
 
   std::vector<bool> should_run = Prune(pdesc, block_id);
-  PADDLE_ENFORCE_EQ(should_run.size(), block.ops_size());
+  PADDLE_ENFORCE_EQ(should_run.size(), static_cast<size_t>(block.ops_size()));
   for (size_t i = 0; i < should_run.size(); ++i) {
     if (should_run[i]) {
       for (auto& var : block.ops(i).outputs()) {
@@ -82,17 +82,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
           }
         }
       }
-      LOG(INFO) << block.ops(i).type();
-      if (block.ops(i).type() == "sum") {
-        LOG(INFO) << "Here";
-        for (auto& var : block.ops(i).inputs()) {
-          for (auto& argu : var.arguments()) {
-            LOG(INFO) << var.parameter() << " " << argu;
-          }
-        }
-      }
       auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i));
-      LOG(INFO) << op->DebugString();
       op->Run(local_scope, *device);
     }
   }
@@ -152,10 +142,8 @@ std::vector<bool> Executor::Prune(const ProgramDesc& pdesc, int block_id) {
         }
       }
 
-      LOG(INFO) << "1 " << op_desc.type();
       should_run.push_back(true);
     } else {
-      LOG(INFO) << "0 " << op_desc.type();
       should_run.push_back(false);
     }
   }
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index ce8b599e0e..5ad5b98e7b 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -131,15 +131,13 @@ class ExecutorTesterRandom : public ::testing::Test {
     paddle::framework::BlockDescBind* root_block = program.Block(0);
 
     // feed data
-    inputs_.push_back({1.0, 2.0, 3.0, 4.0, 5.0, 6.0});
+    inputs_.push_back({1.0, 1.0, 1.0, 1.0, 1.0, 1.0});
     dims_.push_back({batch_size, input_dim});
     AddOp("feed", {}, {{"Out", {"a"}}},
           {{"dims", std::vector<int>{batch_size, input_dim}}, {"col", 0}},
           root_block);
 
     // forward
-    // AddOp("gaussian_random", {}, {{"Out", {"a"}}},
-    //       {{"dims", std::vector<int>{batch_size, input_dim}}}, root_block);
     AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {},
           root_block);
     AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {},
@@ -156,7 +154,8 @@ class ExecutorTesterRandom : public ::testing::Test {
 
     // update
     AddOp("fill_constant", {}, {{"Out", {"learning_rate"}}},
-          {{"shape", std::vector<int>{1}}, {"value", float(1.0)}}, root_block);
+          {{"shape", std::vector<int>{1}}, {"value", float(0.001)}},
+          root_block);
     AddOp("sgd", {{"Param", {"w1"}},
                   {"LearningRate", {"learning_rate"}},
                   {"Grad", {"w1@GRAD"}}},
@@ -285,7 +284,6 @@ TEST_F(ExecutorTesterRandom, GPU) {
   for (int batch_id = 0; batch_id < 3; batch_id++) {
     SetFeedVariable<float>(inputs_, dims_);
     executor->Run(pdesc_, GetGlobalScope(), 0);
-    std::vector<std::vector<float>> result = GetFetchVariable<float>();
   }
 }
 

From 2e7cd201a4337f49ce07de8cde11c3b8dd90f9ab Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 10 Oct 2017 21:30:03 +0000
Subject: [PATCH 47/61] remove log in backward

---
 paddle/framework/backward.cc      | 1 -
 paddle/framework/executor_test.cc | 1 -
 2 files changed, 2 deletions(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 774d8e4918..0a4688db9c 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -378,7 +378,6 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
         backward_descs[dup_op[i]]->Rename(out_name, new_name);
         sum_op_inputs.emplace_back(new_name);
       }
-      LOG(INFO) << "sum_op_inputs size " << sum_op_inputs.size();
       std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
           "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
       pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 5ad5b98e7b..1cd7270240 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -35,7 +35,6 @@ USE_OP(squared_l2_distance);
 USE_OP(fill_constant);
 USE_OP(sgd);
 
-using std::string;
 using namespace paddle::platform;
 using namespace paddle::framework;
 

From 436ea50d5fc8867848892fc53b7f82aa59ae3b41 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Tue, 10 Oct 2017 14:31:47 -0700
Subject: [PATCH 48/61] follow comments

---
 paddle/framework/executor.cc      |  4 +++-
 paddle/framework/executor_test.cc | 17 +++++++++--------
 paddle/framework/scope.cc         |  4 ++--
 paddle/framework/scope.h          |  2 +-
 paddle/operators/feed_op.h        |  2 +-
 paddle/operators/fetch_op.h       |  2 +-
 6 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index def1d1fd06..1db5c878d6 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -44,7 +44,9 @@ Executor::Executor(const std::vector<platform::Place>& places) {
       device_contexts_[i] = new platform::CUDADeviceContext(
           boost::get<platform::GPUPlace>(places[i]));
 #else
-      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+      PADDLE_THROW(
+          "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
+          "option");
 #endif
     }
   }
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 5ad5b98e7b..f36284b528 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -67,7 +67,7 @@ void AddOp(const std::string& type, const VariableNameMap& inputs,
 template <typename T>
 void SetFeedVariable(const std::vector<std::vector<T>>& inputs,
                      const std::vector<std::vector<int64_t>>& dims) {
-  Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value");
+  Variable* g_feed_value = GetGlobalScope().FindVar("feed_value");
   auto& feed_inputs =
       *(g_feed_value->GetMutable<std::vector<paddle::framework::Tensor>>());
   size_t size = inputs.size();
@@ -82,7 +82,7 @@ void SetFeedVariable(const std::vector<std::vector<T>>& inputs,
 // So we can memcpy the data from fetch_value to vector<T>
 template <typename T>
 std::vector<std::vector<T>> GetFetchVariable() {
-  Variable* g_fetch_value = GetGlobalScope()->FindVar("fetch_value");
+  Variable* g_fetch_value = GetGlobalScope().FindVar("fetch_value");
   auto& fetch_outputs =
       *(g_fetch_value->GetMutable<std::vector<paddle::framework::Tensor>>());
 
@@ -232,8 +232,9 @@ TEST_F(ExecutorTesterRandom, CPU) {
 
   std::unique_ptr<Executor> executor(new Executor(places));
 
-  executor->Run(init_pdesc_, GetGlobalScope(), 0);
-  executor->Run(pdesc_, GetGlobalScope(), 0);
+  executor->Run(init_pdesc_, &GetGlobalScope(), 0);
+  SetFeedVariable<float>(inputs_, dims_);
+  executor->Run(pdesc_, &GetGlobalScope(), 0);
   std::vector<std::vector<float>> result = GetFetchVariable<float>();
 }
 
@@ -252,7 +253,7 @@ TEST_F(ExecutorTesterFeedAndFetch, CPU) {
 
   for (int batch_id = 0; batch_id < 3; batch_id++) {
     SetFeedVariable<float>(inputs_, dims_);
-    executor->Run(pdesc_, GetGlobalScope(), 0);
+    executor->Run(pdesc_, &GetGlobalScope(), 0);
     std::vector<std::vector<float>> result = GetFetchVariable<float>();
     PADDLE_ENFORCE_EQ(result.size(), inputs_.size());
     for (size_t i = 0; i < result.size(); ++i) {
@@ -280,10 +281,10 @@ TEST_F(ExecutorTesterRandom, GPU) {
 
   std::unique_ptr<Executor> executor(new Executor(places));
 
-  executor->Run(init_pdesc_, GetGlobalScope(), 0);
+  executor->Run(init_pdesc_, &GetGlobalScope(), 0);
   for (int batch_id = 0; batch_id < 3; batch_id++) {
     SetFeedVariable<float>(inputs_, dims_);
-    executor->Run(pdesc_, GetGlobalScope(), 0);
+    executor->Run(pdesc_, &GetGlobalScope(), 0);
   }
 }
 
@@ -304,7 +305,7 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) {
 
   for (int batch_id = 0; batch_id < 3; batch_id++) {
     SetFeedVariable<float>(inputs_, dims_);
-    executor->Run(pdesc_, GetGlobalScope(), 0);
+    executor->Run(pdesc_, &GetGlobalScope(), 0);
     std::vector<std::vector<float>> result = GetFetchVariable<float>();
     PADDLE_ENFORCE_EQ(result.size(), inputs_.size());
     for (size_t i = 0; i < result.size(); ++i) {
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index c9e53a0d85..5821bac928 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -67,14 +67,14 @@ void Scope::DropKids() {
 
 std::once_flag feed_variable_flag;
 
-framework::Scope* GetGlobalScope() {
+framework::Scope& GetGlobalScope() {
   static std::unique_ptr<framework::Scope> g_scope{nullptr};
   std::call_once(feed_variable_flag, [&]() {
     g_scope.reset(new framework::Scope());
     g_scope->NewVar("feed_value");
     g_scope->NewVar("fetch_value");
   });
-  return g_scope.get();
+  return *(g_scope.get());
 }
 
 }  // namespace framework
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 319d291efe..a8cfb107c2 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -73,7 +73,7 @@ class Scope {
   DISABLE_COPY_AND_ASSIGN(Scope);
 };
 
-framework::Scope* GetGlobalScope();
+framework::Scope& GetGlobalScope();
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h
index e406d22209..9d8158299f 100644
--- a/paddle/operators/feed_op.h
+++ b/paddle/operators/feed_op.h
@@ -26,7 +26,7 @@ class FeedKernel : public framework::OpKernel<T> {
     framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
     out->mutable_data<T>(ctx.GetPlace());
     framework::Variable* g_feed_variable =
-        framework::GetGlobalScope()->FindVar("feed_value");
+        framework::GetGlobalScope().FindVar("feed_value");
     const auto& tensors =
         g_feed_variable->Get<std::vector<framework::Tensor>>();
     int col = ctx.template Attr<int>("col");
diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h
index 6fee8b0589..eb9c3a7b59 100644
--- a/paddle/operators/fetch_op.h
+++ b/paddle/operators/fetch_op.h
@@ -25,7 +25,7 @@ class FetchKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     const framework::Tensor* input = ctx.Input<framework::Tensor>("Input");
     framework::Variable* g_fetch_variable =
-        framework::GetGlobalScope()->FindVar("fetch_value");
+        framework::GetGlobalScope().FindVar("fetch_value");
     auto* tensors =
         g_fetch_variable->GetMutable<std::vector<framework::Tensor>>();
     int col = ctx.template Attr<int>("col");

From a528a9717ec5880f271b9d216cb5532cee9d4504 Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 10 Oct 2017 21:32:03 +0000
Subject: [PATCH 49/61] remove prune as member function to function

---
 paddle/framework/executor.cc | 120 +++++++++++++++++------------------
 paddle/framework/executor.h  |  23 ++++---
 2 files changed, 71 insertions(+), 72 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index def1d1fd06..3c35102ff9 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -32,66 +32,7 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
 
-Executor::Executor(const std::vector<platform::Place>& places) {
-  PADDLE_ENFORCE_GT(places.size(), 0);
-  device_contexts_.resize(places.size());
-  for (size_t i = 0; i < places.size(); i++) {
-    if (platform::is_cpu_place(places[i])) {
-      device_contexts_[i] = new platform::CPUDeviceContext(
-          boost::get<platform::CPUPlace>(places[i]));
-    } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_CUDA
-      device_contexts_[i] = new platform::CUDADeviceContext(
-          boost::get<platform::GPUPlace>(places[i]));
-#else
-      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
-#endif
-    }
-  }
-}
-
-Executor::~Executor() {
-  for (auto& device_context : device_contexts_) {
-    delete device_context;
-  }
-}
-
-void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
-  // TODO(tonyyang-svail):
-  //    - only runs on the first device (i.e. no interdevice communication)
-  //    - will change to use multiple blocks for RNN op and Cond Op
-  PADDLE_ENFORCE_GT(pdesc.blocks_size(), block_id);
-  auto& block = pdesc.blocks(block_id);
-  auto& device = device_contexts_[0];
-
-  // Instantiate all the vars in the global scope
-  for (auto& var : block.vars()) {
-    scope->NewVar(var.name());
-  }
-
-  Scope& local_scope = scope->NewScope();
-
-  std::vector<bool> should_run = Prune(pdesc, block_id);
-  PADDLE_ENFORCE_EQ(should_run.size(), static_cast<size_t>(block.ops_size()));
-  for (size_t i = 0; i < should_run.size(); ++i) {
-    if (should_run[i]) {
-      for (auto& var : block.ops(i).outputs()) {
-        for (auto& argu : var.arguments()) {
-          if (local_scope.FindVar(argu) == nullptr) {
-            local_scope.NewVar(argu);
-          }
-        }
-      }
-      auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i));
-      op->Run(local_scope, *device);
-    }
-  }
-
-  // TODO(tonyyang-svail):
-  //  - Destroy local_scope
-}
-
-std::vector<bool> Executor::Prune(const ProgramDesc& pdesc, int block_id) {
+std::vector<bool> Prune(const ProgramDesc& pdesc, int block_id) {
   // TODO(tonyyang-svail):
   //    - will change to use multiple blocks for RNN op and Cond Op
 
@@ -159,5 +100,64 @@ std::vector<bool> Executor::Prune(const ProgramDesc& pdesc, int block_id) {
   return should_run;
 }
 
+Executor::Executor(const std::vector<platform::Place>& places) {
+  PADDLE_ENFORCE_GT(places.size(), 0);
+  device_contexts_.resize(places.size());
+  for (size_t i = 0; i < places.size(); i++) {
+    if (platform::is_cpu_place(places[i])) {
+      device_contexts_[i] = new platform::CPUDeviceContext(
+          boost::get<platform::CPUPlace>(places[i]));
+    } else if (platform::is_gpu_place(places[i])) {
+#ifdef PADDLE_WITH_CUDA
+      device_contexts_[i] = new platform::CUDADeviceContext(
+          boost::get<platform::GPUPlace>(places[i]));
+#else
+      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+#endif
+    }
+  }
+}
+
+Executor::~Executor() {
+  for (auto& device_context : device_contexts_) {
+    delete device_context;
+  }
+}
+
+void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
+  // TODO(tonyyang-svail):
+  //    - only runs on the first device (i.e. no interdevice communication)
+  //    - will change to use multiple blocks for RNN op and Cond Op
+  PADDLE_ENFORCE_GT(pdesc.blocks_size(), block_id);
+  auto& block = pdesc.blocks(block_id);
+  auto& device = device_contexts_[0];
+
+  // Instantiate all the vars in the global scope
+  for (auto& var : block.vars()) {
+    scope->NewVar(var.name());
+  }
+
+  Scope& local_scope = scope->NewScope();
+
+  std::vector<bool> should_run = Prune(pdesc, block_id);
+  PADDLE_ENFORCE_EQ(should_run.size(), static_cast<size_t>(block.ops_size()));
+  for (size_t i = 0; i < should_run.size(); ++i) {
+    if (should_run[i]) {
+      for (auto& var : block.ops(i).outputs()) {
+        for (auto& argu : var.arguments()) {
+          if (local_scope.FindVar(argu) == nullptr) {
+            local_scope.NewVar(argu);
+          }
+        }
+      }
+      auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i));
+      op->Run(local_scope, *device);
+    }
+  }
+
+  // TODO(tonyyang-svail):
+  //  - Destroy local_scope
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index 7fac4f4f46..4e3bc2c0a5 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -36,21 +36,20 @@ class Executor {
    */
   void Run(const ProgramDesc&, Scope*, int);
 
- protected:
-  /* @Brief
-   * Pruning the graph
-   *
-   * @param
-   *  ProgramDesc
-   *
-   * @return
-   *  vector<bool> Same size as ops. Indicates whether an op should be run.
-   */
-  std::vector<bool> Prune(const ProgramDesc& pdesc, int block_id);
-
  private:
   std::vector<platform::DeviceContext*> device_contexts_;
 };
 
+/* @Brief
+ * Pruning the graph
+ *
+ * @param
+ *  ProgramDesc
+ *
+ * @return
+ *  vector<bool> Same size as ops. Indicates whether an op should be run.
+ */
+std::vector<bool> Prune(const ProgramDesc& pdesc, int block_id);
+
 }  // namespace framework
 }  // namespace paddle

From 434949ca2d23a2fec5c3b4ab8e6bcb0ea18921fc Mon Sep 17 00:00:00 2001
From: Yang Yang <yangyang62@baidu.com>
Date: Tue, 10 Oct 2017 21:51:43 +0000
Subject: [PATCH 50/61] clean up for merge

---
 paddle/framework/executor.cc | 61 +-----------------------------------
 1 file changed, 1 insertion(+), 60 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 69c21d7457..886e9ab33e 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -93,7 +93,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
   //  - Destroy local_scope
 }
 
-std::vector<bool> Executor::Prune(const ProgramDesc& pdesc, int block_id) {
+std::vector<bool> Prune(const ProgramDesc& pdesc, int block_id) {
   // TODO(tonyyang-svail):
   //    - will change to use multiple blocks for RNN op and Cond Op
 
@@ -161,64 +161,5 @@ std::vector<bool> Executor::Prune(const ProgramDesc& pdesc, int block_id) {
   return should_run;
 }
 
-Executor::Executor(const std::vector<platform::Place>& places) {
-  PADDLE_ENFORCE_GT(places.size(), 0);
-  device_contexts_.resize(places.size());
-  for (size_t i = 0; i < places.size(); i++) {
-    if (platform::is_cpu_place(places[i])) {
-      device_contexts_[i] = new platform::CPUDeviceContext(
-          boost::get<platform::CPUPlace>(places[i]));
-    } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_CUDA
-      device_contexts_[i] = new platform::CUDADeviceContext(
-          boost::get<platform::GPUPlace>(places[i]));
-#else
-      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
-#endif
-    }
-  }
-}
-
-Executor::~Executor() {
-  for (auto& device_context : device_contexts_) {
-    delete device_context;
-  }
-}
-
-void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
-  // TODO(tonyyang-svail):
-  //    - only runs on the first device (i.e. no interdevice communication)
-  //    - will change to use multiple blocks for RNN op and Cond Op
-  PADDLE_ENFORCE_GT(pdesc.blocks_size(), block_id);
-  auto& block = pdesc.blocks(block_id);
-  auto& device = device_contexts_[0];
-
-  // Instantiate all the vars in the global scope
-  for (auto& var : block.vars()) {
-    scope->NewVar(var.name());
-  }
-
-  Scope& local_scope = scope->NewScope();
-
-  std::vector<bool> should_run = Prune(pdesc, block_id);
-  PADDLE_ENFORCE_EQ(should_run.size(), static_cast<size_t>(block.ops_size()));
-  for (size_t i = 0; i < should_run.size(); ++i) {
-    if (should_run[i]) {
-      for (auto& var : block.ops(i).outputs()) {
-        for (auto& argu : var.arguments()) {
-          if (local_scope.FindVar(argu) == nullptr) {
-            local_scope.NewVar(argu);
-          }
-        }
-      }
-      auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i));
-      op->Run(local_scope, *device);
-    }
-  }
-
-  // TODO(tonyyang-svail):
-  //  - Destroy local_scope
-}
-
 }  // namespace framework
 }  // namespace paddle

From 72d3d814b5a62617d41e49cd2c6e662ad613ad78 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Wed, 11 Oct 2017 09:32:29 +0800
Subject: [PATCH 51/61] fix math/CMakeLists.txt

---
 paddle/operators/CMakeLists.txt      | 6 +++++-
 paddle/operators/math/CMakeLists.txt | 6 ++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index d132c1813e..89b1895a3d 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -112,7 +112,9 @@ set(DEPS_OPS
     cond_op
     cross_entropy_op
     softmax_with_cross_entropy_op
-    sum_op)
+    sum_op
+    pool_op
+    pool_with_index_op)
 
 
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
@@ -121,6 +123,8 @@ op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(sum_op DEPS net_op)
+op_library(pool_op DEPS pooling)
+op_library(pool_with_index_op DEPS pooling)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index a0ceb029e3..6e2611af7b 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -1,13 +1,15 @@
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu pooling.cc pooling.cu DEPS cblas device_context operator)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator)
     nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
     nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
+    nv_library(pooling SRCS pooling.cc pooling.cu DEPS operator)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator)
+    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
     cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     cc_library(softmax SRCS softmax.cc DEPS operator)
     cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
+    cc_library(pooling SRCS pooling.cc DEPS operator)
 endif()
 
 cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)

From c85d777f879e128a3a9b00ddfc243879a747f5da Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Tue, 10 Oct 2017 22:35:55 +0800
Subject: [PATCH 52/61] follow comments

---
 paddle/operators/math/CMakeLists.txt  |  8 ++++--
 paddle/operators/math/vol2col.cc      |  2 +-
 paddle/operators/math/vol2col_test.cc | 40 +++++++--------------------
 3 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index d6e8373210..575e89eed8 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -1,15 +1,17 @@
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu vol2col.cc vol2col.cu pooling.cc pooling.cu DEPS cblas device_context operator)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu pooling.cc pooling.cu DEPS cblas device_context operator)
     nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
     nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
+    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS cblas device_context operator)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc vol2col.cc pooling.cc DEPS cblas device_context operator)
+    cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator)
     cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     cc_library(softmax SRCS softmax.cc DEPS operator)
     cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
+    cc_library(vol2col SRCS vol2col.cc DEPS cblas device_context operator)
 
 endif()
 
 cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
-cc_test(vol2col_test SRCS vol2col_test.cc DEPS math_function tensor)
+cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor)
diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc
index 5bad2e8073..e9718a0473 100644
--- a/paddle/operators/math/vol2col.cc
+++ b/paddle/operators/math/vol2col.cc
@@ -67,7 +67,7 @@ class Vol2ColFunctor<platform::CPUPlace, T> {
                 ((c * output_depth + d) * output_height + h) * output_width + w;
             if (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
                 w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) {
-              col_data[col_idx] = T(0);
+              col_data[col_idx] = static_cast<T>(0);
             } else {
               int vol_idx =
                   ((c_in * input_depth + d_pad) * input_height + h_pad) *
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc
index 107a94511f..e3c599da87 100644
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
@@ -30,12 +30,12 @@ void testVol2col() {
     context =
         new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
   } else {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
     context =
         new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
 #else
     PADDLE_THROW("no GPU support");
-#endif  // PADDLE_ONLY_CPU
+#endif  // PADDLE_WITH_CUDA
   }
 
   /**
@@ -89,6 +89,7 @@ void testVol2col() {
   vol2col(*context, input, output_cfo, stride, stride, stride, padding, padding,
           padding);
 
+  float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
   float* out_cfo_ptr;
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output_cfo.data<float>();
@@ -97,24 +98,12 @@ void testVol2col() {
     out_cfo_ptr = output_tmp.data<float>();
   }
 
-  EXPECT_EQ(out_cfo_ptr[0], 0);
-  EXPECT_EQ(out_cfo_ptr[1], 1);
-  EXPECT_EQ(out_cfo_ptr[2], 1);
-  EXPECT_EQ(out_cfo_ptr[3], 2);
-  EXPECT_EQ(out_cfo_ptr[4], 3);
-  EXPECT_EQ(out_cfo_ptr[5], 4);
-  EXPECT_EQ(out_cfo_ptr[6], 4);
-  EXPECT_EQ(out_cfo_ptr[7], 5);
-  EXPECT_EQ(out_cfo_ptr[8], 6);
-  EXPECT_EQ(out_cfo_ptr[9], 7);
-  EXPECT_EQ(out_cfo_ptr[10], 7);
-  EXPECT_EQ(out_cfo_ptr[11], 8);
-  EXPECT_EQ(out_cfo_ptr[12], 9);
-  EXPECT_EQ(out_cfo_ptr[13], 10);
-  EXPECT_EQ(out_cfo_ptr[14], 10);
-  EXPECT_EQ(out_cfo_ptr[15], 11);
+  for (int i = 0; i < 16; ++i) {
+    EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]);
+  }
 
   // Col2Vol test
+  float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11};
   memset(input_ptr, 0, 12 * sizeof(float));
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
@@ -134,18 +123,9 @@ void testVol2col() {
     in_cfo_ptr = input_tmp.data<float>();
   }
 
-  EXPECT_EQ(in_cfo_ptr[0], 0);
-  EXPECT_EQ(in_cfo_ptr[1], 2);
-  EXPECT_EQ(in_cfo_ptr[2], 2);
-  EXPECT_EQ(in_cfo_ptr[3], 3);
-  EXPECT_EQ(in_cfo_ptr[4], 8);
-  EXPECT_EQ(in_cfo_ptr[5], 5);
-  EXPECT_EQ(in_cfo_ptr[6], 6);
-  EXPECT_EQ(in_cfo_ptr[7], 14);
-  EXPECT_EQ(in_cfo_ptr[8], 8);
-  EXPECT_EQ(in_cfo_ptr[9], 9);
-  EXPECT_EQ(in_cfo_ptr[10], 20);
-  EXPECT_EQ(in_cfo_ptr[11], 11);
+  for (int i = 0; i < 12; ++i) {
+    EXPECT_EQ(in_cfo_ptr[i], col_2_vol[i]);
+  }
 }
 
 TEST(math, vol2col) {

From 1d41a6d45a917010f21088d1917040565649f2a5 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Wed, 11 Oct 2017 10:11:45 +0800
Subject: [PATCH 53/61] update paddle/operators/math/CMakeLists.txt

---
 paddle/operators/math/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 575e89eed8..d32924db85 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -3,13 +3,13 @@ if(WITH_GPU)
     nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
     nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
-    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS cblas device_context operator)
+    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context operator)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator)
     cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     cc_library(softmax SRCS softmax.cc DEPS operator)
     cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
-    cc_library(vol2col SRCS vol2col.cc DEPS cblas device_context operator)
+    cc_library(vol2col SRCS vol2col.cc DEPS device_context operator)
 
 endif()
 

From 1397e17f6b1fe1088af6ab3117eb7b6c5f4adea3 Mon Sep 17 00:00:00 2001
From: kavyasrinet <kavyasrinet@baidu.com>
Date: Tue, 10 Oct 2017 19:50:34 -0700
Subject: [PATCH 54/61] Implemented the hardShrink activation (#4653)

* Implemented the hardShrink activation

* Fixing the unit test
---
 paddle/operators/activation_op.cc             | 21 ++++++++++
 paddle/operators/activation_op.h              | 38 +++++++++++++++++--
 .../v2/framework/tests/test_activation_op.py  | 20 ++++++++++
 3 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index a6bb738af3..61a201b6cd 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -137,6 +137,24 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+template <typename AttrType>
+class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  HardShrinkOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of HardShrink operator");
+    AddOutput("Y", "Output of HardShrink operator");
+    AddComment(
+        "HardShrink activation operator, "
+        "hard_shrink(x) = x if x > lambda"
+        "hard_shrink(x) = x if x < -lambda"
+        "hard_shrink(x) = 0 otherwise");
+    AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
+        .SetDefault(static_cast<AttrType>(0.5));
+  }
+};
+
 class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
@@ -357,6 +375,9 @@ REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad,
 REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker<float>, stanh_grad,
             ops::ActivationOpGrad);
 
+REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker<float>,
+            hard_shrink_grad, ops::ActivationOpGrad);
+
 #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)        \
   REGISTER_OP_CPU_KERNEL(                                                      \
       act_type,                                                                \
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 70d5a62052..29f159bbae 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -199,6 +199,39 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+// hard_shrink(x) = x, if x > threshold or x < -threshold;
+// hard_shrink(x) = 0, otherwise
+template <typename T>
+struct HardShrinkFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    auto temp1 = (x < (threshold * -1)).template cast<T>().eval();
+    auto temp2 = (x > threshold).template cast<T>().eval();
+    y.device(d) = x * (temp1 + temp2);
+  }
+};
+
+template <typename T>
+struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    auto temp1 = (x < (threshold * -1)).template cast<T>().eval();
+    auto temp2 = (x > threshold).template cast<T>().eval();
+    dx.device(d) = dy * (temp1 + temp2).template cast<T>();
+  }
+};
+
 // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < lambda; 0
 // otherwise
 template <typename T>
@@ -351,8 +384,6 @@ template <typename T>
 struct Relu6Functor : public BaseActivationFunctor<T> {
   float threshold;
 
-  // NOTE: Explicit hides the `BaseActivationFunctor<T>::GetAttrs`
-  // not polymorphism for speed.
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"threshold", &threshold}};
   }
@@ -555,4 +586,5 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
   __macro(relu6, Relu6Functor, Relu6GradFunctor);                 \
   __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);    \
   __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
-  __macro(elu, ELUFunctor, ELUGradFunctor)
+  __macro(elu, ELUFunctor, ELUGradFunctor);                       \
+  __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor)
diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py
index 9157e00f6e..52e027bd54 100644
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
@@ -78,6 +78,26 @@ class TestTanhShrink(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.008)
 
 
+class TestHardShrink(OpTest):
+    def setUp(self):
+        self.op_type = "hard_shrink"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        threshold = 0.5
+
+        self.inputs = {'X': x}
+        self.attrs = {'lambda': threshold}
+
+        t = np.copy(x)
+        t[(t >= -threshold) & (t <= threshold)] = 0
+        self.outputs = {'Y': t}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.005)
+
+
 class TestSoftShrink(OpTest):
     def setUp(self):
         self.op_type = "softshrink"

From 696874ac6ee1b2b284d9817988aa4c99f74c0c76 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 10 Oct 2017 19:54:01 -0700
Subject: [PATCH 55/61] Optimizer Design (#4656)

* init optimizer design

* fix index

* optimize the interface

* add a link to python_api.md

* optimize the code of Optimizer
---
 doc/design/optimizer.md  | 105 +++++++++++++++++++++++++++++++++++++++
 doc/design/python_api.md |   4 ++
 2 files changed, 109 insertions(+)
 create mode 100644 doc/design/optimizer.md

diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md
new file mode 100644
index 0000000000..17440fae50
--- /dev/null
+++ b/doc/design/optimizer.md
@@ -0,0 +1,105 @@
+## Optimizer Design
+
+### The Problem
+
+A PaddlePaddle program, or a block, is a sequence of operators operating on variables.  A training program needs to do three kinds of work:
+
+1. the forward pass, which computes intermediate results and the cost(s),
+1. the backward pass, which derives gradients from intermediate results and costs, and
+1. the optimization pass, which updates model parameters to optimize the cost(s).
+
+These kinds of work rely on three kinds of operators:
+
+1. forward operators,
+1. gradient operators, and
+1. optimization operators.
+
+It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
+
+In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass.
+
+
+### High-level Python API to describe the training process
+
+1. Users write code to describe the network:
+
+	```python
+	images = layer.data("images")
+	labels = layer.data("labels")
+	w1 = pd.var("w1")
+	b1 = pd.var("b1")
+	hidden = layer.fc(images, w=w1, b=b1)
+	cost = layer.mse(hidden, labels)
+	```
+
+	The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+
+
+2. Users create a certain kind of Optimizer with some arguments.
+
+	```python
+	optimizer = AdagradOptimizer(learning_rate=0.001)
+	```
+
+3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list.
+
+	```python
+	opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
+	```
+	The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is a list of optimization operators that will be run by the session.
+
+4. Users use Session/Executor to run this opt_op_list as target to do training.
+
+	```python
+	sess.run(target= opt_op_list, ...)
+	```
+
+#### Optimizer Python interface:
+
+```python
+class Optimizer(object):
+    """Optimizer Base class.
+
+    """
+
+    def __init__(self):
+        pass
+
+    def create_backward_pass(self, loss, parameter_list=None):
+        """
+        create and add gradient Operators in BlockDesc to Compute gradients of `loss`
+        for parameters in parameter_list
+
+        Args:
+          loss: a variable generated by the cost function.
+          parameter_list: parameters whose gradients need to be computed and that are updated to optimize the loss.
+
+        Returns:
+          list of (parameters, gradients) pair.
+        """
+        return None
+
+    def create_optimization_pass(self, parameters_and_grads):
+        """Add optimization operators to update gradients to variables.
+
+        Args:
+          parameters_and_grads: a list of (variable, gradient) pair to update.
+
+        Returns:
+          optimization_op_list: a list of optimization operators that will update the parameters using their gradients.
+        """
+        return None
+
+    def minimize(self, loss, parameter_list):
+        """Add operations to minimize `loss` by updating `parameter_list`.
+
+        This method combines interface `create_backward_pass()` and
+        `create_optimization_pass()` into one.
+        """
+        params_grads = self.create_backward_pass(loss, parameter_list)
+        update_ops = self.create_optimization_pass(params_grads)
+        return update_ops
+
+```
+
+Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
diff --git a/doc/design/python_api.md b/doc/design/python_api.md
index c4665e44fc..56ae1d925a 100644
--- a/doc/design/python_api.md
+++ b/doc/design/python_api.md
@@ -214,3 +214,7 @@ def fc_layer(input, size, ...):
     out.writer = op
     return out
 ```
+
+## Optimizer
+
+[Optimizer Design Doc](./optimizer.md)

From 23407e7a649d07d0b92219d926179f1dcf6f9326 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 10 Oct 2017 19:57:44 -0700
Subject: [PATCH 56/61] Design doc of SelectedRows (#4652)

* Design doc of SelectedRows

* Follow comments

* Update protobuf message

* Follow comments, seperate LoDTensorDesc and SelectedRows Desc
---
 doc/design/selected_rows.md | 74 +++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 doc/design/selected_rows.md

diff --git a/doc/design/selected_rows.md b/doc/design/selected_rows.md
new file mode 100644
index 0000000000..9e6f3b20cb
--- /dev/null
+++ b/doc/design/selected_rows.md
@@ -0,0 +1,74 @@
+# Design Doc: Selected Rows
+
+`SelectedRows` is a kind of sparse tensor data type, which is designed to support `embedding` operators. The gradient of an embedding table is a sparse tensor: only a few rows of that tensor have non-zero values. It is straightforward to represent the sparse tensor by the following sparse tensor data structure:
+
+```cpp
+class SelectedRows {
+ private:
+  vector<int> rows_;
+  Tensor value_;
+  int height_;
+};
+```
+
+The field `height_` is the first dimension of `SelectedRows`. The field `rows_` holds the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows_.size() /* NUM_ROWS */, ...]`, which supplies the values for each of those rows. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`.
+
+Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`. The `SelectedRows` representation would then be:
+
+```
+x = SelectedRows {
+  rows = [73, 84],
+  value = [[1, 2], [3,4]]
+}
+```
+
+
+## SelectedRows in Protobuf
+
+`SelectedRows` is a kind of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described at compile time, since `rows_` and `value_` depend on the training data.
+So we use `TensorDesc` to unify `data_type` and `dims`. A `LodTensorDesc` contains a `TensorDesc` and a `lod_level`. The description of `SelectedRows` is a Tensor description.
+
+```proto
+message TensorDesc {
+  required DataType data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+
+message LodTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int lod_level = 2;
+}
+
+message VarDesc {
+  required string name = 1;
+  enum VarType { 
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LodTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
+}
+```
+
+## InferShape for Selected Rows
+
+Just like `LoD` information, the `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or a `Dense` tensor.
+
+For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should look like the following:
+
+```cpp
+void TableLookupGrad::InferShape(context) {
+  ...
+  context.SetDataType("Embedding.Grad", kSelectedRows);
+}
+```
+
+
+## Sparse Operators
+
+Several operators need to be written to support `SelectedRows`. They are:
+
+1. Operators which generate a `SelectedRows` gradient, e.g. the gradient of `TableLookupOp`.
+2. Optimization operators which support a `SelectedRows` gradient, e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator; `OpWithKernel::Run` should select a suitable kernel for either a `dense` tensor or a `SelectedRows`.

From f5ac335046feb81529e85cd0c386379746771157 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Wed, 11 Oct 2017 11:02:26 +0800
Subject: [PATCH 57/61] follow comments

---
 paddle/operators/math/CMakeLists.txt  |  5 ++-
 paddle/operators/math/vol2col_test.cc | 47 +++++++++++++--------------
 2 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index d32924db85..2fd559e90a 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -3,14 +3,13 @@ if(WITH_GPU)
     nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
     nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
-    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context operator)
+    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator)
     cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     cc_library(softmax SRCS softmax.cc DEPS operator)
     cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
-    cc_library(vol2col SRCS vol2col.cc DEPS device_context operator)
-
+    cc_library(vol2col SRCS vol2col.cc DEPS device_context)
 endif()
 
 cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc
index e3c599da87..81225e9a98 100644
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
@@ -18,10 +18,9 @@ limitations under the License. */
 
 template <typename Place>
 void testVol2col() {
-  paddle::framework::Tensor input_tmp;
   paddle::framework::Tensor input;
-  paddle::framework::Tensor output_cfo;
-  paddle::framework::Tensor output_ocf;
+  paddle::framework::Tensor input_tmp;
+  paddle::framework::Tensor output;
   paddle::framework::Tensor output_tmp;
 
   auto* place = new Place();
@@ -44,14 +43,14 @@ void testVol2col() {
    *          [6, 7, 8,
    *          9, 10, 11]]
    *
-   * output_cfo = [0, 1
-   *               1, 2
-   *               3, 4
-   *               4, 5
-   *               6, 7
-   *               7, 8
-   *               9, 10
-   *               10, 11]
+   * output = [0, 1
+   *           1, 2
+   *           3, 4
+   *           4, 5
+   *           6, 7
+   *           7, 8
+   *           9, 10
+   *           10, 11]
    *
    * col2vol = [[0, 2, 2,
    *             3, 8, 5]
@@ -81,20 +80,20 @@ void testVol2col() {
   } else {
     input.CopyFrom<float>(input_tmp, *place);
   }
-  output_cfo.mutable_data<float>({1, filter_size, filter_size, filter_size,
-                                  output_depth, output_height, output_width},
-                                 *place);
+  output.mutable_data<float>({1, filter_size, filter_size, filter_size,
+                              output_depth, output_height, output_width},
+                             *place);
 
   paddle::operators::math::Vol2ColFunctor<Place, float> vol2col;
-  vol2col(*context, input, output_cfo, stride, stride, stride, padding, padding,
+  vol2col(*context, input, output, stride, stride, stride, padding, padding,
           padding);
 
   float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
   float* out_cfo_ptr;
   if (paddle::platform::is_cpu_place(*place)) {
-    out_cfo_ptr = output_cfo.data<float>();
+    out_cfo_ptr = output.data<float>();
   } else {
-    output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace());
+    output_tmp.CopyFrom<float>(output, paddle::platform::CPUPlace());
     out_cfo_ptr = output_tmp.data<float>();
   }
 
@@ -112,25 +111,25 @@ void testVol2col() {
   }
 
   paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
-  col2vol(*context, input, output_cfo, stride, stride, stride, padding, padding,
+  col2vol(*context, input, output, stride, stride, stride, padding, padding,
           padding);
 
-  float* in_cfo_ptr;
+  float* in_ptr;
   if (paddle::platform::is_cpu_place(*place)) {
-    in_cfo_ptr = input.data<float>();
+    in_ptr = input.data<float>();
   } else {
     input_tmp.CopyFrom<float>(input, paddle::platform::CPUPlace());
-    in_cfo_ptr = input_tmp.data<float>();
+    in_ptr = input_tmp.data<float>();
   }
 
   for (int i = 0; i < 12; ++i) {
-    EXPECT_EQ(in_cfo_ptr[i], col_2_vol[i]);
+    EXPECT_EQ(in_ptr[i], col_2_vol[i]);
   }
 }
 
 TEST(math, vol2col) {
   testVol2col<paddle::platform::CPUPlace>();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testVol2col<paddle::platform::GPUPlace>();
-#endif
+#endif  // PADDLE_WITH_CUDA
 }

From 7454ec0400429676edaa46578b024ff4bd4c028e Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Wed, 11 Oct 2017 03:48:43 +0000
Subject: [PATCH 58/61] Simplify backward when inserting a sum operator to
 accumulate all duplicated variables.

---
 paddle/framework/backward.cc | 28 ++++++----------------------
 1 file changed, 6 insertions(+), 22 deletions(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 0a4688db9c..063b108500 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -172,30 +172,14 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
                               std::to_string(i));
         net->ops_[op_offset]->Rename(name, dup_outputs.back());
       }
-      // collect all the offset to append `add` op for each alias
-      //
-      // one variable is shared between multiple operators.
-      // insert add operator one by one, then add it to output
-      for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
-           ++output_idx) {
-        auto insert_add_x = dup_outputs[output_idx];
-        auto insert_add_y = dup_outputs[output_idx + 1];
-        auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
-        // first add op inserted
-        if (output_idx == dup_outputs.size() - 2) {
-          insert_add_out = name;
-        }
-        if (output_idx != 0) {
-          insert_add_y = name + "@SHARED@" + std::to_string(output_idx - 1);
-        }
-        insert_position.push_back(
-            {dup_op.back(),
-             OpRegistry::CreateOp("sum", {{"X", {insert_add_x, insert_add_y}}},
-                                  {{"Out", {insert_add_out}}}, {})});
-      }
+      // collect all the offset for each alias,
+      // insert a sum operator to add all aliases to output
+      insert_position.push_back(
+          {dup_op.back(), OpRegistry::CreateOp("sum", {{"X", dup_outputs}},
+                                               {{"Out", {name}}}, {})});
     }
 
-    // make sure the inserted `add` ops follow the BFS order.
+    // make sure the inserted `sum` ops follow the BFS order.
     insert_position.sort(
         [](const Pos& l, const Pos& r) { return l.first > r.first; });
 

From 9995aed114148ff96b8e06f1548cff0445fce628 Mon Sep 17 00:00:00 2001
From: kexinzhao <19hskevin87@gmail.com>
Date: Tue, 10 Oct 2017 21:14:48 -0700
Subject: [PATCH 59/61] Implementing Softplus operator (#4690)

* implementing softplus

* small fix

* small fix

* small fix

* small fix
---
 paddle/operators/activation_op.cc             | 14 ++++++++++
 paddle/operators/activation_op.h              | 28 +++++++++++++++++++
 .../v2/framework/tests/test_activation_op.py  | 15 ++++++++++
 3 files changed, 57 insertions(+)

diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 61a201b6cd..ced14a8923 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -206,6 +206,17 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftplusOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Softplus operator");
+    AddOutput("Y", "Output of Softplus operator");
+    AddComment("Softplus activation operator, softplus(x) = log(1 + exp(x))");
+  }
+};
+
 class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SoftsignOpMaker(framework::OpProto *proto,
@@ -351,6 +362,9 @@ REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad,
 REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad,
             ops::ActivationOpGrad);
 
+REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad,
+            ops::ActivationOpGrad);
+
 REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
             ops::ActivationOpGrad);
 
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 29f159bbae..f88c9c48eb 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -407,6 +407,33 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+// softplus(x) = log(1 + exp(x))
+// When x is a very large positive number, exp(x) may explode to inf,
+// Using trick below for numerical stability
+// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
+// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0)))
+template <typename T>
+struct SoftplusFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) {
+    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
+    y.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log());
+  }
+};
+
+// d(softplus(x))/dx = exp(x) / (1 + exp(x))
+// For numerical stability:
+// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) +
+// exp(x - max(x, 0)))
+template <typename T>
+struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
+    dx.device(d) = dy * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp()));
+  }
+};
+
 // softsign(x) = x / (1 + |x|)
 template <typename T>
 struct SoftsignFunctor : public BaseActivationFunctor<T> {
@@ -582,6 +609,7 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
   __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor);       \
   __macro(pow, PowFunctor, PowGradFunctor);                       \
   __macro(stanh, STanhFunctor, STanhGradFunctor);                 \
+  __macro(softplus, SoftplusFunctor, SoftplusGradFunctor);        \
   __macro(softsign, SoftsignFunctor, SoftsignGradFunctor);        \
   __macro(relu6, Relu6Functor, Relu6GradFunctor);                 \
   __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);    \
diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py
index 52e027bd54..a28c4431e1 100644
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
@@ -331,6 +331,21 @@ class TestSTanh(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.007)
 
 
+class TestSoftplus(OpTest):
+    def setUp(self):
+        # Expected output is the plain formula log(1 + exp(x)).
+        x = np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        self.op_type = "softplus"
+        self.inputs = {'X': x}
+        self.outputs = {'Y': np.log(1 + np.exp(x))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
 class TestSoftsign(OpTest):
     def setUp(self):
         self.op_type = "softsign"

From 82a2b1a92db573f0021d145a96f4bddbaf0606e8 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Wed, 11 Oct 2017 15:29:48 +0800
Subject: [PATCH 60/61] fix Compile error [fatal error:
 boost/range/adaptor/reversed.hpp No such file or directory]

---
 paddle/framework/executor.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 886e9ab33e..c388b2198e 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -24,8 +24,6 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
 
-#include <boost/range/adaptor/reversed.hpp>
-
 namespace paddle {
 namespace framework {
 

From 4849fba7e98044b4d2e951638562342da5d399fe Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Wed, 11 Oct 2017 15:36:23 +0800
Subject: [PATCH 61/61] follow comments

---
 paddle/operators/math/CMakeLists.txt   |  4 +-
 paddle/operators/pool_op.cc            | 41 +++++++++++++++-----
 paddle/operators/pool_with_index_op.cc | 52 +++++++++++++++++++-------
 3 files changed, 72 insertions(+), 25 deletions(-)

diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 2c1bc6d910..1a2f623ce7 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -3,14 +3,14 @@ if(WITH_GPU)
     nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
     nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
-    nv_library(pooling SRCS pooling.cc pooling.cu DEPS operator)
+    nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
     nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
     cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     cc_library(softmax SRCS softmax.cc DEPS operator)
     cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
-    cc_library(pooling SRCS pooling.cc DEPS operator)
+    cc_library(pooling SRCS pooling.cc DEPS device_context)
     cc_library(vol2col SRCS vol2col.cc DEPS device_context)
 endif()
 
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
index 25fd01844b..c6d9aae133 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -35,7 +35,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
 
   PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
-                 "Pooling intput should be 4-D or 5-D");
+                 "Pooling input should be 4-D or 5-D tensor.");
 
   if (ctx->Attrs().Get<bool>("globalPooling")) {
     ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
@@ -70,11 +70,11 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
     : OpProtoAndCheckerMaker(proto, op_checker) {
   AddInput(
       "X",
-      "The input tensor of pooling operator. "
+      "(Tensor) The input tensor of pooling operator. "
       "The format of input tensor is NCHW. Where N is batch size, C is the "
       "number of channels, H and W is the height and width of feature.");
   AddOutput("Out",
-            "The output tensor of pooling operator."
+            "(Tensor) The output tensor of pooling operator."
             "The format of output tensor is also NCHW."
             "Where N is batch size, C is "
             "the number of channels, H and W is the height and "
@@ -87,7 +87,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
 
   AddAttr<std::vector<int>>(
       "ksize",
-      "The pooling size(height, width) of pooling operator."
+      "The pooling window size(height, width) of pooling operator."
       "If globalPooling = true, ksize is ignored and need not be "
       "specified.");  // TODO(Chengduo): Add checker. (Currently,
                       // TypedAttrChecker don't support vector type.)
@@ -99,12 +99,12 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
       "If globalPooling = true, ksize is ignored and need not be specified.")
       .SetDefault(false);
   AddAttr<std::vector<int>>("strides",
-                            "Strides(height, width) of pooling operator."
+                            "The strides(height, width) of pooling window."
                             "Default {1,1}.")
       .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
                             // TypedAttrChecker don't support vector type.)
   AddAttr<std::vector<int>>("paddings",
-                            "Paddings(height, width) of pooling operator."
+                            "Zero padding(height, width) size on both sides. "
                             "Default {0,0}.")
       .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
                             // TypedAttrChecker don't support vector type.)
@@ -116,6 +116,17 @@ Input(X) and output(Out) are in NCHW format. Where N is batch size, C is the
 number of channels, H and W is the height and width of feature.
 Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       X shape: (N, C, H_in, W_in)
+  Output:
+       Out shape: (N, C, H_out, W_out)
+
+  where
+       H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+       W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
 )DOC");
 }
 
@@ -124,12 +135,12 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
     : OpProtoAndCheckerMaker(proto, op_checker) {
   AddInput(
       "X",
-      "The input tensor of pooling operator. "
+      "(Tensor) The input tensor of pooling operator. "
       "The format of input tensor is NCDHW. Where N is batch size, C is "
       "the number of channels, D, H and W is the depth, height and width of "
       "feature.");
   AddOutput("Out",
-            "The output tensor of pooling operator."
+            "(Tensor) The output tensor of pooling operator."
             "The format of output tensor is also NCDHW."
             "Where N is batch size, C is "
             "the number of channels, D, H and W is the depth, height and "
@@ -142,7 +153,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
 
   AddAttr<std::vector<int>>(
       "ksize",
-      "The pooling size(depth, height, width) of pooling operator."
+      "The pooling window size(depth, height, width) of pooling operator."
       "If globalPooling = true, ksize is ignored and need not be "
       "specified.");  // TODO(Chengduo): Add checker. (Currently,
                       // TypedAttrChecker don't support vector type.)
@@ -172,6 +183,18 @@ Input(X) and output(Out) are in NCDHW format. Where N is batch
 size, C is the number of channels, D, H and W is the depth, height and
 width of feature. Parameters(ksize, strides, paddings) are three elements.
 These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       X shape: (N, C, D_in, H_in, W_in)
+  Output:
+       Out shape: (N, C, D_out, H_out, W_out)
+
+  where
+       D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+       H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+       W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1;
 )DOC");
 }
 }  // namespace operators
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
index ae6a81d871..005ee88693 100644
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -43,7 +43,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
     std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
 
     PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
-                   "Pooling intput should be 4-D or 5-D");
+                   "Pooling input should be 4-D or 5-D tensor.");
 
     if (ctx->Attrs().Get<bool>("globalPooling")) {
       ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
@@ -74,8 +74,8 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Mask"), "Input(Mask) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                    "Input(X@GRAD) should not be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
@@ -89,17 +89,17 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
-        "The input tensor of pooling operator. "
+        "(Tensor) The input tensor of pooling operator. "
         "The format of input tensor is NCHW. Where N is batch size, C is the "
         "number of channels, H and W is the height and width of image.");
     AddOutput("Out",
-              "The output tensor of pooling operator."
+              "(Tensor) The output tensor of pooling operator."
               "The format of output tensor is also NCHW."
               "Where N is batch size, C is "
               "the number of channels, H and W is the height and "
               "width of image.");
     AddOutput("Mask",
-              "The Mask tensor of pooling operator."
+              "(Tensor) The Mask tensor of pooling operator."
               "The format of output tensor is also NCHW."
               "Where N is batch size, C is the number of channels, H and W "
               "is the height and width of image."
@@ -107,7 +107,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddAttr<std::vector<int>>(
         "ksize",
-        "The pooling size(height, width) of pooling operator."
+        "The pooling window size(height, width) of pooling operator."
         "If globalPooling = true, ksize is ignored and need not be "
         "specified.");  // TODO(Chengduo): Add checker. (Currently,
                         // TypedAttrChecker don't support vector type.)
@@ -119,13 +119,14 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
         "If globalPooling = true, ksize is ignored and need not be specified.")
         .SetDefault(false);
     AddAttr<std::vector<int>>("strides",
-                              "Strides(height, width) of pooling operator."
+                              "The strides(height, width) of pooling window."
                               "Default {1,1}.")
         .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
                               // TypedAttrChecker don't support vector type.)
-    AddAttr<std::vector<int>>("paddings",
-                              "Paddings(height, width) of pooling operator."
-                              "Default {0,0}.")
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "The zero padding(height, width) size on both sides. "
+        "Default {0,0}.")
         .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
                               // TypedAttrChecker don't support vector type.)
 
@@ -136,6 +137,17 @@ output(Out, Mask) are in NCHW format. Where N is batch size, C is the
 number of channels, H and W is the height and width of feature.
 Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
+The input(X) size and output(Out, Mask) size may be different.
+
+Example:
+  Input:
+       X shape: (N, C, H_in, W_in)
+  Output:
+       Out shape: (N, C, H_out, W_out)
+       Mask shape: (N, C, H_out, W_out)
+  where
+       H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+       W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
 )DOC");
   }
 };
@@ -147,18 +159,18 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
-        "The input tensor of pooling operator. "
+        "(Tensor) The input tensor of pooling operator. "
         "The format of input tensor is NCDHW. Where N is batch size, C is "
         "the number of channels, D, H and W is the depth, height and width of "
         "image.");
     AddOutput("Out",
-              "The output tensor of pooling operator."
+              "(Tensor) The output tensor of pooling operator."
               "The format of output tensor is also NCDHW."
               "Where N is batch size, C is "
               "the number of channels, D, H and W is the depth, height and "
               "width of image.");
     AddOutput("Mask",
-              "The Mask tensor of pooling operator."
+              "(Tensor) The Mask tensor of pooling operator."
               "The format of output tensor is also NCDHW."
               "Where N is batch size, C is the number of channels, D, H and W "
               "is the depth, height and width of image."
@@ -166,7 +178,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddAttr<std::vector<int>>(
         "ksize",
-        "The pooling size(depth, height, width) of pooling operator."
+        "The pooling window size(depth, height, width) of pooling operator."
         "If globalPooling = true, ksize is ignored and need not be "
         "specified.");  // TODO(Chengduo): Add checker. (Currently,
                         // TypedAttrChecker don't support vector type.)
@@ -197,6 +209,18 @@ Input(X) and output(Out, Mask) are in NCDHW format. Where N is batch
 size, C is the number of channels, D, H and W is the depth, height and
 width of feature. Parameters(ksize, strides, paddings) are three elements.
 These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out, Mask) size may be different.
+
+Example:
+  Input:
+       X shape: (N, C, D_in, H_in, W_in)
+  Output:
+       Out shape: (N, C, D_out, H_out, W_out)
+       Mask shape: (N, C, D_out, H_out, W_out)
+  where
+       D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+       H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+       W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1;
 )DOC");
   }
 };