From 7683e35816f448351e4a4037b5b4c6f55e34835d Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Sun, 3 Sep 2017 23:17:43 +0000
Subject: [PATCH 01/17] cond op: add cond_op.cc and cond_op.h

---
 paddle/operators/cond_op.cc |  56 +++++++++++++++
 paddle/operators/cond_op.h  | 131 ++++++++++++++++++++++++++++++++++++
 2 files changed, 187 insertions(+)
 create mode 100644 paddle/operators/cond_op.cc
 create mode 100644 paddle/operators/cond_op.h
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
new file mode 100644
index 0000000000..be5e0e6a5b
--- /dev/null
+++ b/paddle/operators/cond_op.cc
@@ -0,0 +1,56 @@
+#include "paddle/operators/switch_op.h"
+
+namespace paddle {
+namespace operators {
+
+void CondOp::InferShape(const std::shared_ptr<Scope>& scope) const {
+  // Create two Nets
+  // Create two scopes
+  for (int i = 0; i < 2; ++i)
+    sub_scope.push_back(scope.NewScope());
+
+  for (int i = 0; i < 2; ++i)
+    sub_net_op_[i].InferShape(sub_scope[i]);
+
+  for (int i = 0; i < 2; ++i)
+    tensor_index = new Tensor();
+
+  for (int i = 0; i < 2; ++i)
+    _index.push_back(vector<int>());
+  
+  for (int i = 0; i < 2; ++i)
+  {
+    // for (auto& input : net_op_[i]->Inputs()) {
+    for (auto& input : GetAttr<std::vector<std::string>>("True_inputs")) {
+      auto var_name = input.second;
+      // Create a new tensor in sub-scope for input-type tensor
+      sub_scope[i]->NewVar(var_name)->GetMutable<Tensor>();
+    }
+  }
+}
+
+class CondOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+public:
+  CondOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Cond", "The condition, which is a bool vector");
+    AddInput("Xs", "Inputs of Subnets");
+    AddAttr<std::vector<std::string>>("sub_inputs", "Inputs of the Whole Op, net op and so forth");
+    AddAttr<std::vector<std::string>>("sub_outputs", "True Outputs needs merge");
+    AddOutput("Outs", "The output of cond op");
+
+    AddComment(R"DOC(
+Sample dependent Cond Operator:
+The equation is: Out[i] = subnet_t[i], if Cond[i] == true
+Out[i] = subnet_t[i], if Cond[i] == false
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(cond_op,
+            paddle::operators::CondOp,
+            paddle::operators::CondOpProtoAndCheckerMaker);
+
diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
new file mode 100644
index 0000000000..e9ae41b191
--- /dev/null
+++ b/paddle/operators/cond_op.h
@@ -0,0 +1,131 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "glog/logging.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/ddim.h"
+#include "paddle/operators/gather.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using namespace paddle::framework;
+
+template <typename Place, typename T>
+class CondOp final : public OperatorBase {
+public:
+  /**
+   * InferShape must be called before Run.
+   */
+  void InferShape(const std::shared_ptr<Scope>& scope) const override;
+
+  // Set True Block
+  void set_truenet(std::unique_ptr<OperatorBase> net) {
+    sub_net_op_[0] = std::move(net);
+  }
+
+  // Set False Block
+  void set_falsenet(std::unique_ptr<OperatorBase> net) {
+    sub_net_op_[1] = std::move(net);
+  }
+
+  virtual void Run(const std::shared_ptr<Scope>& scope,
+                   const platform::DeviceContext& dev_ctx) const override {
+    auto* cond = context.Input<Tensor>("Cond");
+    // Step 1: get the true/false index at runtime
+    // _index[0]: vector<int>, contains all index for cond[i] == true
+    // _index[1]: vector<int>, contains all index for cond[i] == false
+    for(int i = 0; i < 2; ++i)
+      _index[i].clear();
+    for(int i = 0; i < cond->dims()[0]; ++i) {
+      if (cond->data<bool>()[i])
+        _index[0].push_back(i);
+      else
+        _index[1].push_back(i);
+    }
+    // put _index[0] and _index[1] into two tensors
+    // tensor_index[0] and tensor_index[1]
+    framework::DDim dim_ = paddle::framework::make_ddim({0});
+    for(int i = 0; i < 2; ++i) {
+      dim_[0] = _index[i].size();
+      int* tmp_ = _index[i]->mutable_data<int>(dim_, CPUPlace());
+      tensor_index[i]->Resize(dim_);
+      memcpy(tmp_, index_[i], dim_[0] * sizeof(int));
+    }
+    
+    
+    // Step 2: collect data by calling gather
+    for (int i = 0; i < 2; ++i) { 
+      // i= 0/i for True and False branches respectively
+      for (auto& input : GetAttr<std::vector<std::string>>("sub_inputs")) {
+        auto var_name = input.second;
+        // find Tensor
+        Tensor* Tensor_parent = scope.FindVar(var_name)->GetMutable<Tensor>();
+        Tensor* Tensor_child = sub_scope_[i].FindVar(var_name)->GetMutable<Tensor>();
+        Gather<T>(dev_ctx.GetPlace(), tensor_parent, tensor_index[i], tensor_child); 
+      }
+    }
+
+    // Step 3: run
+    for (int i = 0; i < 2; ++i)
+      sub_net_op_[i]->Run(sub_scope_[i], dev_ctx);
+
+    // Step 4: merge output results
+    for (int i = 0; i < 2; ++i) {
+      // i= 0/i for True and False branches respectively
+      for (auto& output : GetAttr<std::vector<std::string>>("sub_outputs")) {
+        auto var_name = output.second;
+        // find Tensor
+        Tensor* Tensor_parent = scope.FindVar(var_name)->GetMutable<Tensor>();
+        Tensor* Tensor_child = sub_scope_[i].FindVar(var_name)->GetMutable<Tensor>();
+        ScatterUpdate<T>(dev_ctx.GetPlace(), tensor_child, tensor_index[i], tensor_parent);
+      }
+    }
+  }
+
+private:
+  // sub_scope_[0]: true scope
+  // sub_scope_[1]: false scope
+  std::vector<Scope*> sub_scope_;
+
+  // sub_net_op_[0]: subnet_t
+  // sub_net_op_[1]: subnet_f
+  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
+
+  // tensor_index[0]: True_index tensor
+  // tensor_index[1]: False_index;
+  std::vector<Tensor*> tensor_index;
+
+  // _index[0]: True_index; 
+  // _index[1]: False_index;
+  vector<vector<int> > _index;
+};
+
+/*
+class CondGradientOp final : public OperatorBase {
+public:
+	void Init() override;
+
+	virtual void InferShape(const std::shared_ptr<Scope>& scope) const override;
+
+	virtual void Run(const std::shared_ptr<Scope>& scope,
+                   const platform::DeviceContext& dev_ctx) const override;
+};*/
+
+}  // namespace operators
+}  // namespace paddle
+

From adfef243d2d83e90fe59488864486f6db9449cc3 Mon Sep 17 00:00:00 2001
From: Zhuoyuan <chenzhuoyuan07@gmail.com>
Date: Tue, 5 Sep 2017 23:41:48 +0000
Subject: [PATCH 02/17] tensor element size support; remove cond_op.cc and cond_op.h

---
 paddle/framework/tensor.h       |  11 ++-
 paddle/framework/tensor_impl.h  |   2 +-
 paddle/framework/tensor_test.cc |   2 +
 paddle/operators/cond_op.cc     |  56 --------------
 paddle/operators/cond_op.h      | 131 --------------------------------
 5 files changed, 13 insertions(+), 189 deletions(-)
 delete mode 100644 paddle/operators/cond_op.cc
 delete mode 100644 paddle/operators/cond_op.h

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 643f875491..657d3e6628 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -75,6 +75,9 @@ class Tensor {
   template <typename T>
   inline T* mutable_data(DDim dims, platform::Place place);
 
+  /*! Size of a single element in data() */
+  inline size_t element_size() { return holder_->element_size(); }
+
   /*! Return the dimensions of the memory block. */
   inline const DDim& dims() const;
 
@@ -123,6 +126,7 @@ class Tensor {
     virtual ~Placeholder() {}
     virtual void* ptr() const = 0;
     virtual size_t size() const = 0;
+    virtual size_t element_size() const = 0;
     virtual std::type_index type() const = 0;
     virtual platform::Place place() const = 0;
   };
@@ -133,7 +137,8 @@ class Tensor {
         : ptr_(static_cast<T*>(memory::Alloc(place, size)),
                memory::PODDeleter<T, Place>(place)),
           place_(place),
-          size_(size) {
+          size_(size),
+          element_size_(sizeof(T)) {
       PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                               (is_cpu_place(place_) ? "CPU" : "GPU"));
     }
@@ -142,6 +147,7 @@ class Tensor {
     virtual platform::Place place() const { return place_; }
     virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
     virtual std::type_index type() const { return std::type_index(typeid(T)); }
+    virtual size_t element_size() const { return element_size_; }
 
     /*! the pointer of memory block. */
     std::unique_ptr<T, memory::PODDeleter<T, Place>> ptr_;
@@ -151,6 +157,9 @@ class Tensor {
 
     /*! the size of memory block. */
     size_t size_;
+
+    /*! the size of a single element */
+    size_t element_size_;
   };
 
   /*! holds the memory block if allocated. */
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index 7893e233b7..6a989a31cc 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -22,7 +22,7 @@ namespace framework {
 template <typename T>
 inline void Tensor::check_memory_size() const {
   PADDLE_ENFORCE_NOT_NULL(
-      holder_, "Tenosr holds no memory. Call Tensor::mutable_data first.");
+      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
   PADDLE_ENFORCE_GE(
       holder_->size(), product(dims_) * sizeof(T) + offset_,
       "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 7db38d5cae..da0a4d6363 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -59,6 +59,8 @@ TEST(Tensor, MutableData) {
     // initialization
     p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
     EXPECT_NE(p1, nullptr);
+    // check tensor type
+    EXPECT_EQ(src_tensor.element_size(), sizeof(float));
     // set src_tensor a new dim with large size
     // momery is supposed to be re-allocated
     p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CPUPlace());
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
deleted file mode 100644
index be5e0e6a5b..0000000000
--- a/paddle/operators/cond_op.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-#include "paddle/operators/switch_op.h"
-
-namespace paddle {
-namespace operators {
-
-void CondOp::InferShape(const std::shared_ptr<Scope>& scope) const {
-  // Create two Nets
-  // Create two scopes
-  for (int i = 0; i < 2; ++i)
-    sub_scope.push_back(scope.NewScope());
-
-  for (int i = 0; i < 2; ++i)
-    sub_net_op_[i].InferShape(sub_scope[i]);
-
-  for (int i = 0; i < 2; ++i)
-    tensor_index = new Tensor();
-
-  for (int i = 0; i < 2; ++i)
-    _index.push_back(vector<int>());
-  
-  for (int i = 0; i < 2; ++i)
-  {
-    // for (auto& input : net_op_[i]->Inputs()) {
-    for (auto& input : GetAttr<std::vector<std::string>>("True_inputs")) {
-      auto var_name = input.second;
-      // Create a new tensor in sub-scope for input-type tensor
-      sub_scope[i]->NewVar(var_name)->GetMutable<Tensor>();
-    }
-  }
-}
-
-class CondOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
-public:
-  CondOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Cond", "The condition, which is a bool vector");
-    AddInput("Xs", "Inputs of Subnets");
-    AddAttr<std::vector<std::string>>("sub_inputs", "Inputs of the Whole Op, net op and so forth");
-    AddAttr<std::vector<std::string>>("sub_outputs", "True Outputs needs merge");
-    AddOutput("Outs", "The output of cond op");
-
-    AddComment(R"DOC(
-Sample dependent Cond Operator:
-The equation is: Out[i] = subnet_t[i], if Cond[i] == true
-Out[i] = subnet_t[i], if Cond[i] == false
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_WITHOUT_GRADIENT(cond_op,
-            paddle::operators::CondOp,
-            paddle::operators::CondOpProtoAndCheckerMaker);
-
diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
deleted file mode 100644
index e9ae41b191..0000000000
--- a/paddle/operators/cond_op.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "glog/logging.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/ddim.h"
-#include "paddle/operators/gather.h"
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using namespace paddle::framework;
-
-template <typename Place, typename T>
-class CondOp final : public OperatorBase {
-public:
-  /**
-   * InferShape must be called before Run.
-   */
-  void InferShape(const std::shared_ptr<Scope>& scope) const override;
-
-  // Set True Block
-  void set_truenet(std::unique_ptr<OperatorBase> net) {
-    sub_net_op_[0] = std::move(net);
-  }
-
-  // Set False Block
-  void set_falsenet(std::unique_ptr<OperatorBase> net) {
-    sub_net_op_[1] = std::move(net);
-  }
-
-  virtual void Run(const std::shared_ptr<Scope>& scope,
-                   const platform::DeviceContext& dev_ctx) const override {
-    auto* cond = context.Input<Tensor>("Cond");
-    // Step 1: get the true/false index at runtime
-    // _index[0]: vector<int>, contains all index for cond[i] == true
-    // _index[1]: vector<int>, contains all index for cond[i] == false
-    for(int i = 0; i < 2; ++i)
-      _index[i].clear();
-    for(int i = 0; i < cond->dims()[0]; ++i) {
-      if (cond->data<bool>()[i])
-        _index[0].push_back(i);
-      else
-        _index[1].push_back(i);
-    }
-    // put _index[0] and _index[1] into two tensors
-    // tensor_index[0] and tensor_index[1]
-    framework::DDim dim_ = paddle::framework::make_ddim({0});
-    for(int i = 0; i < 2; ++i) {
-      dim_[0] = _index[i].size();
-      int* tmp_ = _index[i]->mutable_data<int>(dim_, CPUPlace());
-      tensor_index[i]->Resize(dim_);
-      memcpy(tmp_, index_[i], dim_[0] * sizeof(int));
-    }
-    
-    
-    // Step 2: collect data by calling gather
-    for (int i = 0; i < 2; ++i) { 
-      // i= 0/i for True and False branches respectively
-      for (auto& input : GetAttr<std::vector<std::string>>("sub_inputs")) {
-        auto var_name = input.second;
-        // find Tensor
-        Tensor* Tensor_parent = scope.FindVar(var_name)->GetMutable<Tensor>();
-        Tensor* Tensor_child = sub_scope_[i].FindVar(var_name)->GetMutable<Tensor>();
-        Gather<T>(dev_ctx.GetPlace(), tensor_parent, tensor_index[i], tensor_child); 
-      }
-    }
-
-    // Step 3: run
-    for (int i = 0; i < 2; ++i)
-      sub_net_op_[i]->Run(sub_scope_[i], dev_ctx);
-
-    // Step 4: merge output results
-    for (int i = 0; i < 2; ++i) {
-      // i= 0/i for True and False branches respectively
-      for (auto& output : GetAttr<std::vector<std::string>>("sub_outputs")) {
-        auto var_name = output.second;
-        // find Tensor
-        Tensor* Tensor_parent = scope.FindVar(var_name)->GetMutable<Tensor>();
-        Tensor* Tensor_child = sub_scope_[i].FindVar(var_name)->GetMutable<Tensor>();
-        ScatterUpdate<T>(dev_ctx.GetPlace(), tensor_child, tensor_index[i], tensor_parent);
-      }
-    }
-  }
-
-private:
-  // sub_scope_[0]: true scope
-  // sub_scope_[1]: false scope
-  std::vector<Scope*> sub_scope_;
-
-  // sub_net_op_[0]: subnet_t
-  // sub_net_op_[1]: subnet_f
-  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
-
-  // tensor_index[0]: True_index tensor
-  // tensor_index[1]: False_index;
-  std::vector<Tensor*> tensor_index;
-
-  // _index[0]: True_index; 
-  // _index[1]: False_index;
-  vector<vector<int> > _index;
-};
-
-/*
-class CondGradientOp final : public OperatorBase {
-public:
-	void Init() override;
-
-	virtual void InferShape(const std::shared_ptr<Scope>& scope) const override;
-
-	virtual void Run(const std::shared_ptr<Scope>& scope,
-                   const platform::DeviceContext& dev_ctx) const override;
-};*/
-
-}  // namespace operators
-}  // namespace paddle
-

From d8921e9d4b825650b79ba52a281b01317d0d0952 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 7 Sep 2017 21:32:39 -0700
Subject: [PATCH 03/17] Fix CI test: make Tensor::element_size() const, update expected error text

---
 paddle/framework/tensor.h       | 2 +-
 paddle/framework/tensor_test.cc | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 657d3e6628..9d05b87408 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -76,7 +76,7 @@ class Tensor {
   inline T* mutable_data(DDim dims, platform::Place place);
 
   /*! Size of a single element in data() */
-  inline size_t element_size() { return holder_->element_size(); }
+  inline size_t element_size() const { return holder_->element_size(); }
 
   /*! Return the dimensions of the memory block. */
   inline const DDim& dims() const;
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index da0a4d6363..7747cb34fc 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -36,7 +36,7 @@ TEST(Tensor, DataAssert) {
   } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg =
-        "holder_ should not be null\nTenosr holds no memory. Call "
+        "holder_ should not be null\nTensor holds no memory. Call "
         "Tensor::mutable_data first.";
     const char* what = err.what();
     for (size_t i = 0; i < msg.length(); ++i) {
@@ -114,7 +114,7 @@ TEST(Tensor, ShareDataWith) {
     } catch (paddle::platform::EnforceNotMet err) {
       caught = true;
       std::string msg =
-          "holder_ should not be null\nTenosr holds no memory. Call "
+          "holder_ should not be null\nTensor holds no memory. Call "
           "Tensor::mutable_data first.";
       const char* what = err.what();
       for (size_t i = 0; i < msg.length(); ++i) {

From c5fa417c62257d14d5fc426d5b8319cb4c747b9a Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Fri, 8 Sep 2017 11:10:44 -0700
Subject: [PATCH 04/17] Host and device transform API

* with unit-tests
* Also complete `memcpy`: add CPU<->GPU specializations of `Copy`
---
 paddle/memory/memcpy.cc           | 18 +++++++
 paddle/platform/CMakeLists.txt    |  1 +
 paddle/platform/transform.h       | 61 ++++++++++++++++++++++
 paddle/platform/transform_test.cu | 84 +++++++++++++++++++++++++++++++
 4 files changed, 164 insertions(+)
 create mode 100644 paddle/platform/transform.h
 create mode 100644 paddle/platform/transform_test.cu

diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
index a19a3e3675..19ec9ba9b2 100644
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -62,6 +62,24 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
   }
 }
 
+template <>
+void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num) {
+  platform::SetDeviceId(src_place.device);
+  platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
+                                                  const void* src, size_t num) {
+  platform::SetDeviceId(dst_place.device);
+  platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
+}
+
 #endif  // PADDLE_ONLY_CPU
 
 }  // namespace memory
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 17bdac8749..8b605e51c3 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -24,3 +24,4 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
 
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
+nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place)
diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h
new file mode 100644
index 0000000000..fcd300f2d9
--- /dev/null
+++ b/paddle/platform/transform.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/hostdevice.h"
+#include "paddle/platform/place.h"
+
+#include <algorithm>
+#ifdef __NVCC__
+#include <thrust/transform.h>
+#endif
+
+namespace paddle {
+namespace platform {
+
+// Transform on host or device. It provides the same API in std library.
+template <typename Place, typename InputIter, typename OutputIter,
+          typename UnaryOperation>
+void Transform(Place place, InputIter first, InputIter last, OutputIter result,
+               UnaryOperation op) {
+  if (is_cpu_place(place)) {
+    std::transform(first, last, result, op);
+  } else {
+#ifdef __NVCC__
+    thrust::transform(first, last, result, op);
+#else
+    PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
+#endif
+  }
+}
+
+template <typename Place, typename InputIter1, typename InputIter2,
+          typename OutputIter, typename BinaryOperation>
+void Transform(Place place, InputIter1 first1, InputIter1 last1,
+               InputIter2 first2, OutputIter result, BinaryOperation op) {
+  if (is_cpu_place(place)) {
+    std::transform(first1, last1, first2, result, op);
+  } else {
+#ifdef __NVCC__
+    thrust::transform(first1, last1, first2, result, op);
+#else
+    PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
+#endif
+  }
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu
new file mode 100644
index 0000000000..600fed8f45
--- /dev/null
+++ b/paddle/platform/transform_test.cu
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/transform.h"
+
+template <typename T>
+class Scale {
+ public:
+  explicit Scale(const T& scale) : scale_(scale) {}
+
+  HOSTDEVICE T operator()(const T& a) const { return a * scale_; }
+
+ private:
+  T scale_;
+};
+
+template <typename T>
+class Multiply {
+ public:
+  HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
+};
+
+TEST(Transform, CPUUnary) {
+  using namespace paddle::platform;
+  float buf[4] = {0.1, 0.2, 0.3, 0.4};
+  Transform(CPUPlace(), buf, buf + 4, buf, Scale<float>(10));
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
+  }
+}
+
+TEST(Transform, GPUUnary) {
+  using namespace paddle::platform;
+  using namespace paddle::memory;
+  GPUPlace gpu0(0);
+  float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
+  float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
+  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
+  Transform(gpu0, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
+  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
+  Free(gpu0, gpu_buf);
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
+  }
+}
+
+TEST(Transform, CPUBinary) {
+  using namespace paddle::platform;
+  using namespace paddle::memory;
+  int buf[4] = {1, 2, 3, 4};
+  Transform(CPUPlace(), buf, buf + 4, buf, buf, Multiply<int>());
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_EQ((i + 1) * (i + 1), buf[i]);
+  }
+}
+
+TEST(Transform, GPUBinary) {
+  using namespace paddle::platform;
+  using namespace paddle::memory;
+  int buf[4] = {1, 2, 3, 4};
+  GPUPlace gpu0(0);
+  int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
+  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
+  Transform(gpu0, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
+  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
+  Free(gpu0, gpu_buf);
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_EQ((i + 1) * (i + 1), buf[i]);
+  }
+}
\ No newline at end of file

From 6fbf097bccf77f74927e7a19aa879182088558ca Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 11 Sep 2017 20:11:56 -0700
Subject: [PATCH 05/17] Cast raw pointers to thrust::device_ptr in Transform

Fix TravisCI
---
 paddle/platform/transform.h | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h
index fcd300f2d9..c80446b45c 100644
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
@@ -19,13 +19,43 @@
 #include "paddle/platform/place.h"
 
 #include <algorithm>
+#include <type_traits>
 #ifdef __NVCC__
+#include <thrust/device_ptr.h>
 #include <thrust/transform.h>
 #endif
 
 namespace paddle {
 namespace platform {
 
+#ifdef __NVCC__
+template <typename T, bool is_ptr>
+struct DevicePtrCast;
+
+template <typename T>
+struct DevicePtrCast<T, true> {
+  using ELEM = typename std::remove_pointer<T>::type;
+  using RTYPE = thrust::device_ptr<ELEM>;
+
+  inline thrust::device_ptr<ELEM> operator()(ELEM* ele) const {
+    return thrust::device_pointer_cast(ele);
+  }
+};
+
+template <typename T>
+struct DevicePtrCast<T, false> {
+  using RTYPE = T;
+  inline RTYPE operator()(RTYPE it) const { return it; }
+};
+
+template <typename T>
+auto DevCast(T t) ->
+    typename DevicePtrCast<T, std::is_pointer<T>::value>::RTYPE {
+  DevicePtrCast<T, std::is_pointer<T>::value> cast;
+  return cast(t);
+}
+#endif
+
 // Transform on host or device. It provides the same API in std library.
 template <typename Place, typename InputIter, typename OutputIter,
           typename UnaryOperation>
@@ -35,7 +65,7 @@ void Transform(Place place, InputIter first, InputIter last, OutputIter result,
     std::transform(first, last, result, op);
   } else {
 #ifdef __NVCC__
-    thrust::transform(first, last, result, op);
+    thrust::transform(DevCast(first), DevCast(last), DevCast(result), op);
 #else
     PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
 #endif
@@ -50,7 +80,8 @@ void Transform(Place place, InputIter1 first1, InputIter1 last1,
     std::transform(first1, last1, first2, result, op);
   } else {
 #ifdef __NVCC__
-    thrust::transform(first1, last1, first2, result, op);
+    thrust::transform(DevCast(first1), DevCast(last1), DevCast(first2),
+                      DevCast(result), op);
 #else
     PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
 #endif

From b8e75c1f1a0b56993b3b1a528784e9e86d5a7277 Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Tue, 12 Sep 2017 15:10:31 -0700
Subject: [PATCH 06/17] cond op: re-add operator with CMake, pybind and Python test

---
 paddle/operators/CMakeLists.txt               |   2 +
 paddle/operators/cond_op.cc                   |  45 ++++
 paddle/operators/cond_op.h                    | 232 ++++++++++++++++++
 paddle/pybind/pybind.cc                       |  23 ++
 python/paddle/v2/framework/op.py              |  22 ++
 .../paddle/v2/framework/tests/test_cond_op.py | 114 +++++++++
 6 files changed, 438 insertions(+)
 create mode 100644 paddle/operators/cond_op.cc
 create mode 100644 paddle/operators/cond_op.h
 create mode 100644 python/paddle/v2/framework/tests/test_cond_op.py

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index f9ea25ab04..639ccd4052 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -55,12 +55,14 @@ set(DEPS_OPS
     minus_op
     mul_op
     recurrent_op
+    cond_op
     scale_op)
 op_library(identity_op DEPS scale_op)
 op_library(minus_op DEPS scale_op)
 op_library(mul_op DEPS math_function)
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
   DEPS framework_proto tensor operator net_op)
+op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(scale_op DEPS net_op)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
new file mode 100644
index 0000000000..cb7fed7ebd
--- /dev/null
+++ b/paddle/operators/cond_op.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cond_op.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CondOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  CondOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Cond", "The condition, which is a bool vector");
+    AddInput("Xs", "Inputs of Subnets").AsDuplicable();
+    AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable();
+
+    AddOutput("SubScopes", "sub scopes for true and false branches");
+    AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
+
+    AddComment(R"DOC(
+Sample dependent Cond Operator:
+The equation is: Out[i] = subnet_t[i], if Cond[i] == true
+Out[i] = subnet_t[i], if Cond[i] == false
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(cond_op, paddle::operators::CondOp,
+                             paddle::operators::CondOpProtoAndCheckerMaker);
diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
new file mode 100644
index 0000000000..b776f8ccd9
--- /dev/null
+++ b/paddle/operators/cond_op.h
@@ -0,0 +1,232 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "glog/logging.h"
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/operators/gather.h"
+#include "paddle/operators/scatter.h"
+
+namespace paddle {
+namespace operators {
+
+using namespace paddle::framework;
+
+class CondOp : public OperatorBase {
+ public:
+  CondOp(const std::string& type, const VariableNameMap& inputs,
+         const VariableNameMap& outputs, const AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    index_.resize(2);
+    sub_net_op_.resize(2);
+    LOG(INFO) << "Initialization Done.";
+  }
+
+  CondOp(const CondOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    // TODO(yuyang18): Implement copy ctor well.
+    PADDLE_THROW("Not implemented");
+  }
+
+  void CreateScope(const Scope& scope) const {
+    auto sub_scopes_var = scope.FindVar("SubScopes");
+    PADDLE_ENFORCE(sub_scopes_var != nullptr, "");
+    auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
+    auto& sub_scope = scope.NewScope();
+    sub_scopes->push_back(&sub_scope);
+  }
+
+  void CreateIndexTensor(const Scope& scope) const {
+    auto index_tensors_var = scope.FindVar("IndexTensors");
+    PADDLE_ENFORCE(index_tensors_var != nullptr, "");
+    auto& index_tensors =
+        *index_tensors_var->GetMutable<std::vector<Tensor*>>();
+    Tensor index_tensor;
+    index_tensors.push_back(&index_tensor);
+  }
+
+  /**
+   * InferShape must be called before Run.
+   */
+  void InferShape(const framework::Scope& scope) const override {
+    auto sub_scopes_var = scope.FindVar("SubScopes");
+    PADDLE_ENFORCE_NOT_NULL(sub_scopes_var);
+    auto& sub_scopes = *sub_scopes_var->GetMutable<std::vector<Scope*>>();
+    // auto& index_tensors =
+    // *scope.FindVar("IndexTensors")->GetMutable<std::vector<Tensor*>>();
+
+    for (int i = 0; i < 2; ++i) {
+      // Create two sub scopes for true and false branches
+      // sub_scopes[0] for the true branch and sub_scopes[1] for the false
+      // branch
+      CreateScope(scope);
+
+      // Create two tensors for true and false indices
+      // index_tensors[0] for the true branch and index_tensors[1] for the false
+      // branch
+      CreateIndexTensor(scope);
+
+      for (auto& input : Inputs("Xs")) {
+        // Create a new tensor in sub-scope for input-type tensor
+        Variable* v = sub_scopes[i]->NewVar(input);
+        Tensor* sub_input = v->GetMutable<Tensor>();
+        sub_input->Resize(scope.FindVar(input)->GetMutable<Tensor>()->dims());
+      }
+
+      // Inputs that do not require tailoring
+      /*for (auto& input : (*sub_net_op_[i]).Inputs()) {
+        // weights are located in the parent scope rather than sub scope
+        for (auto& var_name : input.second) {
+          if (!sub_scopes[i]->FindVar(var_name)) {
+            sub_scopes[i]->NewVar(var_name)->GetMutable<Tensor>();
+          }
+        }
+      }*/
+
+      // Outputs
+      for (auto& output : (*sub_net_op_[i]).Outputs()) {
+        for (auto& var_name : output.second) {
+          sub_scopes[i]->NewVar(var_name);
+        }
+      }
+
+      // each net calls InferShape
+      LOG(INFO) << "OK 3";
+      sub_net_op_[i]->InferShape(*sub_scopes[i]);
+      LOG(INFO) << "OK 4";
+    }
+
+    for (auto& output : Outputs("Outs")) {
+      Tensor* tensor_t_out =
+          sub_scopes[0]->FindVar(output)->GetMutable<Tensor>();
+      Tensor* tensor_f_out =
+          sub_scopes[1]->FindVar(output)->GetMutable<Tensor>();
+      Tensor* tensor_out = scope.FindVar(output)->GetMutable<Tensor>();
+      // check output size should be same
+      PADDLE_ENFORCE_EQ(tensor_t_out->dims(), tensor_f_out->dims(),
+                        "Outputs not of the same shape");
+      tensor_out->Resize(tensor_t_out->dims());
+    }
+    LOG(INFO) << "OK 5";
+  }
+
+  // Set True Block
+  void set_truenet(std::unique_ptr<OperatorBase> net) {
+    sub_net_op_[0] = std::move(net);
+  }
+
+  // Set False Block
+  void set_falsenet(std::unique_ptr<OperatorBase> net) {
+    sub_net_op_[1] = std::move(net);
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    auto sub_scopes = scope.FindVar("SubScopes")->Get<std::vector<Scope*>>();
+    auto index_tensors =
+        scope.FindVar("IndexTensors")->Get<std::vector<Tensor*>>();
+
+    std::string cond_name = Input("Cond");
+    Variable* cond_var = scope.FindVar(cond_name);
+    PADDLE_ENFORCE_NOT_NULL(cond_var)
+    const Tensor* cond = cond_var->GetMutable<Tensor>();
+
+    // Step 1: get the true/false index at runtime
+    // index_[0]: vector<int>, contains all index for cond[i] == true
+    // index_[1]: vector<int>, contains all index for cond[i] == false
+    for (int i = 0; i < 2; ++i) index_[i].clear();
+
+    const bool* cond_data = cond->data<bool>();
+    for (int i = 0; i < cond->dims()[0]; ++i) {
+      if (cond_data[i])
+        index_[0].push_back(i);
+      else
+        index_[1].push_back(i);
+    }
+    // put index_[0] and index_[1] into two tensors:
+    // index_tensor_[0] and index_tensor_[1]
+    framework::DDim dim = paddle::framework::make_ddim({0});
+    for (int i = 0; i < 2; ++i) {
+      dim[0] = index_[i].size();
+      int* tmp_ptr =
+          index_tensors[i]->mutable_data<int>(dim, platform::CPUPlace());
+      index_tensors[i]->Resize(dim);
+      memcpy(tmp_ptr, index_[i].data(), dim[0] * sizeof(int));
+    }
+
+    // Step 2: collect data by calling gather
+    for (int i = 0; i < 2; ++i) {
+      // i= 0/i for True and False branches respectively
+      for (auto& input : Inputs("Xs")) {
+        // find Tensor
+        // Tensor* tensor_parent = scope.FindVar(input)->GetMutable<Tensor>();
+        Variable* v = scope.FindVar(input);
+        Tensor* tensor_parent = v->GetMutable<Tensor>();
+        // Tensor* tensor_child =
+        // sub_scope_[i].FindVar(input)->GetMutable<Tensor>();
+        v = sub_scopes[i]->FindVar(input);
+        Tensor* tensor_child = v->GetMutable<Tensor>();
+        Gather<float>(dev_ctx.GetPlace(), tensor_parent, index_tensors[i],
+                      tensor_child);
+      }
+    }
+
+    // Step 3: run
+    for (int i = 0; i < 2; ++i) sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+
+    // Step 4: merge output results
+    for (int i = 0; i < 2; ++i) {
+      // i= 0/i for True and False branches respectively
+      // for (auto& output : GetAttr<std::vector<std::string>>("sub_outputs")) {
+      for (auto& output : Outputs("Outs")) {
+        // find Tensor
+        Variable* v = scope.FindVar(output);
+        Tensor* tensor_parent = v->GetMutable<Tensor>();
+        v = sub_scopes[i]->FindVar(output);
+        Tensor* tensor_child = v->GetMutable<Tensor>();
+        ScatterUpdate<float>(dev_ctx.GetPlace(), tensor_child, index_tensors[i],
+                             tensor_parent);
+      }
+    }
+  }
+
+ private:
+  // sub_net_op_[0]: subnet_t
+  // sub_net_op_[1]: subnet_f
+  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
+
+  // index_[0]: True_index;
+  // index_[1]: False_index;
+  mutable std::vector<std::vector<int>> index_;
+};
+
+/*
+class CondGradientOp final : public OperatorBase {
+public:
+        void Init() override;
+
+        virtual void InferShape(const std::shared_ptr<Scope>& scope) const
+override;
+
+        virtual void Run(const std::shared_ptr<Scope>& scope,
+                   const platform::DeviceContext& dev_ctx) const override;
+};*/
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 16a2368aae..3eeae856fb 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -41,6 +41,7 @@ USE_OP(softmax);
 USE_OP(rowwise_add);
 USE_OP(fill_zeros_like);
 USE_NO_KERNEL_OP(recurrent);
+USE_NO_KERNEL_OP(cond);
 USE_OP(gaussian_random);
 USE_OP(uniform_random);
 USE_OP(lookup_table);
@@ -324,6 +325,28 @@ All parameter, weight, gradient are variables in Paddle.
            [](operators::RecurrentOp &self, const operators::NetOp &net)
                -> void { self.set_stepnet(net.Clone()); });
 
+  // cond_op: Python factory (create) and setters for the true/false sub-nets
+  py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
+      .def_static("create",
+                  [](py::bytes protobin) -> operators::CondOp * {
+                    OpDesc desc;
+                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                                   "Cannot parse user input to OpDesc");
+                    PADDLE_ENFORCE(desc.IsInitialized(),
+                                   "User OpDesc is not initialized, reason %s",
+                                   desc.InitializationErrorString());
+                    auto cond_op = OpRegistry::CreateOp(desc);
+                    return static_cast<operators::CondOp *>(cond_op.release());
+                  })
+      .def("set_truenet",
+           [](operators::CondOp &self, const operators::NetOp &net) -> void {
+             self.set_truenet(net.Clone());
+           })
+      .def("set_falsenet",
+           [](operators::CondOp &self, const operators::NetOp &net) -> void {
+             self.set_falsenet(net.Clone());
+           });
+
   m.def("unique_integer", UniqueIntegerGenerator);
 
   m.def("is_compile_gpu", IsCompileGPU);
diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py
index 9e665adad2..bddd4d8908 100644
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
@@ -215,5 +215,27 @@ class __RecurrentOp__(object):
         return core.RecurrentOp.create(proto.SerializeToString())
 
 
+class __CondOp__(object):
+    __proto__ = None
+    type = 'cond_op'
+
+    def __init__(self):
+        # cache cond_op's proto: the get_all_op_protos() entry matching `type`
+        if self.__proto__ is None:
+            for op_proto in get_all_op_protos():
+                if op_proto.type == self.type:
+                    self.__proto__ = op_proto
+
+    def __call__(self, *args, **kwargs):
+        if self.type not in args and 'type' not in kwargs:
+            kwargs['type'] = self.type
+        # create proto from the call arguments
+        create_method = OpDescCreationMethod(self.__proto__)
+        proto = create_method(*args, **kwargs)
+        # create condop from the serialized proto
+        return core.CondOp.create(proto.SerializeToString())
+
+
 Operator = OperatorFactory()  # The default global factory
 RecurrentOp = __RecurrentOp__()
+CondOp = __CondOp__()
diff --git a/python/paddle/v2/framework/tests/test_cond_op.py b/python/paddle/v2/framework/tests/test_cond_op.py
new file mode 100644
index 0000000000..1fe5889b7f
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
@@ -0,0 +1,114 @@
+import logging
+import paddle.v2.framework.core as core
+import unittest
+import numpy as np
+from paddle.v2.framework.op import Operator, CondOp
+
+
+class PySimpleCond(object):
+    '''
+    A simple implementation of dynamic if-else based on numpy
+    '''
+
+    def __init__(self):
+        array = [True] * 10
+        for i in range(1, 10, 2):
+            array[i] = False
+        self.cond = np.array(array)
+        self.x = np.ones(shape=(10, 1))
+
+    def forward(self):
+        self.index_t = np.where(self.cond)
+        self.index_f = np.where(self.cond == False)
+        y_t = self.x[self.index_t]
+        y_f = self.x[self.index_f]
+        y_t = y_t * 2.
+        y_f = y_f * (-2.)
+        output = np.zeros(shape=(10, 1))
+        output[self.index_t] = y_t
+        output[self.index_f] = y_f
+        return output
+
+
+class PySimpleCondTest(unittest.TestCase):
+    def setUp(self):
+        self.condnn = PySimpleCond()
+
+    def test_forward(self):
+        output = self.condnn.forward()
+        print 'output', output  # smoke test: result is printed, not asserted
+
+
+def create_tensor(scope, name, shape, np_data):
+    tensor = scope.new_var(name).get_tensor()  # tensor of new var `name`
+    tensor.set_dims(shape)
+    tensor.set(np_data, core.CPUPlace())  # fill from numpy, using CPUPlace
+    return tensor
+
+
+class TestCondOp(unittest.TestCase):
+    '''
+    Test CondOp
+
+    equation:
+        cond = [True, False, True, False, ...]
+        y[index_t] = x[index_t] * 2.
+        y[index_f] = x[index_f] * -2.
+    outputs:
+        y
+    '''
+
+    def setUp(self):
+        self.py_cond = PySimpleCond()
+
+    def forward(self):
+        self.scope = core.Scope()
+        self.create_global_variables()
+        self.create_cond_op()
+        self.create_sub_net()
+        ctx = core.DeviceContext.create(core.CPUPlace())
+        print 'running infer shape'
+        print self.scope.find_var("SubScopes")
+        self.condop.infer_shape(self.scope)
+        print 'ok 2'
+        self.condop.run(self.scope, ctx)
+        print 'ok 3'
+        return np.array(self.scope.find_var("Outs").get_tensor())
+
+    def create_global_variables(self):
+        x_np_data = self.py_cond.x
+        create_tensor(self.scope, "x", [10, 1], x_np_data)
+        cond_np_data = self.py_cond.cond  # NOTE(review): never used below
+        create_tensor(self.scope, "cond", [10, 1], x_np_data)
+        self.scope.new_var("SubScopes")
+        self.scope.new_var("IndexTensors")
+        self.scope.new_var("Outs")
+
+    def create_cond_op(self):
+        self.condop = CondOp(
+            Cond="cond",
+            Xs=["x"],
+            Outs=['Out_final'],  # NOTE(review): forward() reads "Outs" - confirm
+            SubScopes="SubScopes",
+            IndexTensors="IndexTensors")
+
+    def create_sub_net(self):
+        truenet = core.Net.create()
+        scale_op_t = Operator("scale", X='X', Y='Out', scale=2.)
+        truenet.append_op(scale_op_t)
+        truenet.complete_add_op(True)
+        self.condop.set_truenet(truenet)
+
+        falsenet = core.Net.create()
+        scale_op_t = Operator("scale", X='X', Y='Out', scale=-2.)
+        falsenet.append_op(scale_op_t)
+        falsenet.complete_add_op(True)
+        self.condop.set_falsenet(falsenet)
+
+    def test_forward(self):
+        print 'test cond op forward'
+        py_output = self.forward()  # NOTE(review): result is never asserted
+
+
+if __name__ == "__main__":
+    unittest.main()

From f8c6792aa3ac17135f33c2de01f693ea781e1212 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 12 Sep 2017 19:44:13 -0700
Subject: [PATCH 07/17] Extract DevPtrCast to device_ptr_cast.h

---
 paddle/platform/details/device_ptr_cast.h | 56 +++++++++++++++++++++++
 paddle/platform/transform.h               | 40 +++-------------
 2 files changed, 63 insertions(+), 33 deletions(-)
 create mode 100644 paddle/platform/details/device_ptr_cast.h

diff --git a/paddle/platform/details/device_ptr_cast.h b/paddle/platform/details/device_ptr_cast.h
new file mode 100644
index 0000000000..4015491fcd
--- /dev/null
+++ b/paddle/platform/details/device_ptr_cast.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef __NVCC__
+#error device_ptr_cast must be include by .cu file
+#endif
+
+#include <thrust/device_ptr.h>
+
+namespace paddle {
+namespace platform {
+namespace details {
+template <typename T, bool is_ptr>
+struct DevicePtrCast;
+
+// T is a pointer type: wrap the raw pointer in a thrust::device_ptr.
+template <typename T>
+struct DevicePtrCast<T, true> {
+  using ELEM = typename std::remove_pointer<T>::type;
+  using RTYPE = thrust::device_ptr<ELEM>;
+  RTYPE operator()(ELEM* ele) const { return thrust::device_pointer_cast(ele); }
+};
+
+// T is not a pointer (e.g. an iterator): hand the value back unchanged.
+template <typename T>
+struct DevicePtrCast<T, false> {
+  using RTYPE = T;
+
+  T operator()(T it) const { return it; }
+};
+
+// Casts `t` to thrust::device_ptr when T is a pointer type; any other T
+// (e.g. an iterator) is returned as is.
+template <typename T>
+auto DevPtrCast(T t) ->
+    typename DevicePtrCast<T, std::is_pointer<T>::value>::RTYPE {
+  using Caster = DevicePtrCast<T, std::is_pointer<T>::value>;
+  return Caster()(t);
+}
+
+}  // namespace details
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h
index c80446b45c..3ee4acd296 100644
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
@@ -21,41 +21,12 @@
 #include <algorithm>
 #include <type_traits>
 #ifdef __NVCC__
-#include <thrust/device_ptr.h>
 #include <thrust/transform.h>
+#include "paddle/platform/details/device_ptr_cast.h"
 #endif
 
 namespace paddle {
 namespace platform {
-
-#ifdef __NVCC__
-template <typename T, bool is_ptr>
-struct DevicePtrCast;
-
-template <typename T>
-struct DevicePtrCast<T, true> {
-  using ELEM = typename std::remove_pointer<T>::type;
-  using RTYPE = thrust::device_ptr<ELEM>;
-
-  inline thrust::device_ptr<ELEM> operator()(ELEM* ele) const {
-    return thrust::device_pointer_cast(ele);
-  }
-};
-
-template <typename T>
-struct DevicePtrCast<T, false> {
-  using RTYPE = T;
-  inline RTYPE operator()(RTYPE it) const { return it; }
-};
-
-template <typename T>
-auto DevCast(T t) ->
-    typename DevicePtrCast<T, std::is_pointer<T>::value>::RTYPE {
-  DevicePtrCast<T, std::is_pointer<T>::value> cast;
-  return cast(t);
-}
-#endif
-
 // Transform on host or device. It provides the same API in std library.
 template <typename Place, typename InputIter, typename OutputIter,
           typename UnaryOperation>
@@ -65,7 +36,9 @@ void Transform(Place place, InputIter first, InputIter last, OutputIter result,
     std::transform(first, last, result, op);
   } else {
 #ifdef __NVCC__
-    thrust::transform(DevCast(first), DevCast(last), DevCast(result), op);
+    using namespace details;
+    thrust::transform(DevPtrCast(first), DevPtrCast(last), DevPtrCast(result),
+                      op);
 #else
     PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
 #endif
@@ -80,8 +53,9 @@ void Transform(Place place, InputIter1 first1, InputIter1 last1,
     std::transform(first1, last1, first2, result, op);
   } else {
 #ifdef __NVCC__
-    thrust::transform(DevCast(first1), DevCast(last1), DevCast(first2),
-                      DevCast(result), op);
+    using namespace details;
+    thrust::transform(DevPtrCast(first1), DevPtrCast(last1), DevPtrCast(first2),
+                      DevPtrCast(result), op);
 #else
     PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
 #endif

From c7db6e8d146df415ad0011afac7e4d2562f83dcb Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Wed, 13 Sep 2017 14:14:49 -0700
Subject: [PATCH 08/17] cond op passed

---
 paddle/operators/cond_op.cc                   | 166 ++++++++++++++++-
 paddle/operators/cond_op.h                    | 173 +-----------------
 paddle/pybind/pybind.cc                       |   1 +
 python/paddle/v2/framework/op.py              |   6 +-
 .../paddle/v2/framework/tests/CMakeLists.txt  |   1 +
 .../paddle/v2/framework/tests/test_cond_op.py |  40 ++--
 6 files changed, 198 insertions(+), 189 deletions(-)

diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index cb7fed7ebd..a3e4a2506f 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -13,15 +13,175 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/cond_op.h"
+
+#include <cstring>
+#include <sstream>
+
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/gather.h"
 #include "paddle/operators/net_op.h"
+#include "paddle/operators/scatter.h"
 
 namespace paddle {
 namespace operators {
 
-class CondOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+using Scope = framework::Scope;
+using Variable = framework::Variable;
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+// Appends a scope obtained from scope.NewScope() to the "SubScopes" list.
+void CondOp::CreateScope(const Scope& scope) const {
+  auto* sub_scopes_var = scope.FindVar("SubScopes");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var, "SubScopes variable not found");
+  auto* sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
+  sub_scopes->push_back(&scope.NewScope());
+}
+
+// Appends an empty Tensor to the "IndexTensors" list stored in `scope`.
+void CondOp::CreateIndexTensor(const Scope& scope) const {
+  auto* index_tensors_var = scope.FindVar("IndexTensors");
+  PADDLE_ENFORCE_NOT_NULL(index_tensors_var, "IndexTensors not found");
+  index_tensors_var->GetMutable<std::vector<Tensor>>()->emplace_back();
+}
+
+// Builds one sub-scope and one index tensor per branch, lets each sub-net
+// infer its shapes there, then sizes every "Outs" tensor from the results.
+// NOTE: every call appends two more entries to SubScopes and IndexTensors.
+void CondOp::InferShape(const Scope& scope) const {
+  auto* sub_scopes_var = scope.FindVar("SubScopes");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var, "SubScopes variable not found");
+  auto& sub_scopes = *sub_scopes_var->GetMutable<std::vector<Scope*>>();
+  PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs can't be empty");
+
+  for (int i = 0; i < 2; ++i) {
+    // Branch i gets its own sub-scope and index tensor:
+    // i == 0 is the true branch, i == 1 the false branch.
+    CreateScope(scope);
+    CreateIndexTensor(scope);
+
+    for (auto& input : Inputs("Xs")) {
+      // Mirror every input in the sub-scope with the parent's dims.
+      auto* parent_var = scope.FindVar(input);
+      PADDLE_ENFORCE_NOT_NULL(parent_var, "Input not found");
+      Tensor* sub_input = sub_scopes[i]->NewVar(input)->GetMutable<Tensor>();
+      sub_input->Resize(parent_var->GetMutable<Tensor>()->dims());
+    }
+
+    for (auto& output : (*sub_net_op_[i]).Outputs()) {
+      for (auto& var_name : output.second) {
+        sub_scopes[i]->NewVar(var_name);
+      }
+    }
+
+    // each net calls InferShape
+    sub_net_op_[i]->InferShape(*sub_scopes[i]);
+  }
+
+  for (auto& output : Outputs("Outs")) {
+    auto* t_out_var = sub_scopes[0]->FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(t_out_var, "True output not found");
+    auto* f_out_var = sub_scopes[1]->FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(f_out_var, "False output not found");
+    auto* tensor_out_var = scope.FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(tensor_out_var, "Output not found");
+
+    Tensor* tensor_t_out = t_out_var->GetMutable<Tensor>();
+    Tensor* tensor_f_out = f_out_var->GetMutable<Tensor>();
+    Tensor* tensor_out = tensor_out_var->GetMutable<Tensor>();
+    // both branches must produce outputs of the same shape
+    PADDLE_ENFORCE_EQ(tensor_t_out->dims(), tensor_f_out->dims(),
+                      "Outputs not of the same shape");
+    tensor_out->Resize(tensor_t_out->dims());
+    tensor_out->mutable_data<float>(tensor_out->dims(), platform::CPUPlace());
+  }
+}
+
+// Splits the rows of every "Xs" input by Cond, runs the true/false sub-nets
+// on their share of rows, and scatters both results back into "Outs".
+void CondOp::Run(const Scope& scope,
+                 const platform::DeviceContext& dev_ctx) const {
+  auto* sub_scopes_var = scope.FindVar("SubScopes");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var, "SubScopes variable not found");
+  const auto& sub_scopes = sub_scopes_var->Get<std::vector<Scope*>>();
+  auto* index_tensors_var = scope.FindVar("IndexTensors");
+  PADDLE_ENFORCE_NOT_NULL(index_tensors_var, "IndexTensors not found");
+  // By reference, so the indices land in the scope's tensors, not in a copy.
+  auto& index_tensors = *index_tensors_var->GetMutable<std::vector<Tensor>>();
+  PADDLE_ENFORCE(sub_scopes.size() >= 2 && index_tensors.size() >= 2,
+                 "InferShape must be called before Run");
+
+  Variable* cond_var = scope.FindVar(Input("Cond"));
+  PADDLE_ENFORCE_NOT_NULL(cond_var);
+  const Tensor* cond = cond_var->GetMutable<Tensor>();
+
+  // Step 1: split the row ids at runtime.
+  // index_[0] collects every i with cond[i] != 0, index_[1] all other i.
+  for (int i = 0; i < 2; ++i) index_[i].clear();
+
+  const int* cond_data = cond->data<int>();
+  for (int i = 0; i < cond->dims()[0]; ++i) {
+    index_[cond_data[i] ? 0 : 1].push_back(i);
+  }
+
+  // Copy index_[0] / index_[1] into index_tensors[0] / index_tensors[1].
+  DDim index_dim = paddle::framework::make_ddim({0});
+  for (int i = 0; i < 2; ++i) {
+    index_dim[0] = index_[i].size();
+    int* tmp_ptr =
+        index_tensors[i].mutable_data<int>(index_dim, platform::CPUPlace());
+    index_tensors[i].Resize(index_dim);
+    memcpy(tmp_ptr, index_[i].data(), index_dim[0] * sizeof(int));
+  }
+
+  // Step 2: gather each branch's rows of every input into its sub-scope.
+  for (int i = 0; i < 2; ++i) {
+    // i = 0 / 1 for the true / false branch respectively
+    for (auto& input : Inputs("Xs")) {
+      Variable* v = scope.FindVar(input);
+      PADDLE_ENFORCE_NOT_NULL(v);
+      Tensor* tensor_parent = v->GetMutable<Tensor>();
+
+      v = sub_scopes[i]->FindVar(input);
+      PADDLE_ENFORCE_NOT_NULL(v);
+      Tensor* tensor_child = v->GetMutable<Tensor>();
+
+      // Shrink the child's first dimension to this branch's row count.
+      DDim child_dim = tensor_child->dims();
+      child_dim[0] = index_[i].size();
+      tensor_child->Resize(child_dim);
+      tensor_child->mutable_data<float>(child_dim, platform::CPUPlace());
+
+      Gather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
+                    tensor_child);
+    }
+  }
+
+  // Step 3: run both sub-nets inside their own sub-scopes.
+  for (int i = 0; i < 2; ++i) sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+
+  // Step 4: scatter each branch's outputs back into the parent outputs.
+  for (int i = 0; i < 2; ++i) {
+    // i = 0 / 1 for the true / false branch respectively
+    for (auto& output : Outputs("Outs")) {
+      Variable* v = scope.FindVar(output);
+      PADDLE_ENFORCE_NOT_NULL(v);
+      Tensor* tensor_parent = v->GetMutable<Tensor>();
+
+      v = sub_scopes[i]->FindVar(output);
+      PADDLE_ENFORCE_NOT_NULL(v);
+      Tensor* tensor_child = v->GetMutable<Tensor>();
+
+      ScatterUpdate<float>(dev_ctx.GetPlace(), tensor_child, &index_tensors[i],
+                           tensor_parent);
+    }
+  }
+}
+
+class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CondOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
+  CondOpProtoAndCheckerMaker(framework::OpProto* proto,
+                             framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Cond", "The condition, which is a bool vector");
     AddInput("Xs", "Inputs of Subnets").AsDuplicable();
@@ -41,5 +201,5 @@ Out[i] = subnet_t[i], if Cond[i] == false
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_WITHOUT_GRADIENT(cond_op, paddle::operators::CondOp,
+REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp,
                              paddle::operators::CondOpProtoAndCheckerMaker);
diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
index b776f8ccd9..27a6e9e3c3 100644
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
@@ -19,22 +19,19 @@ limitations under the License. */
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/tensor.h"
-#include "paddle/operators/gather.h"
-#include "paddle/operators/scatter.h"
+#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
 
-using namespace paddle::framework;
-
-class CondOp : public OperatorBase {
+class CondOp : public framework::OperatorBase {
  public:
-  CondOp(const std::string& type, const VariableNameMap& inputs,
-         const VariableNameMap& outputs, const AttributeMap& attrs)
+  CondOp(const std::string& type, const framework::VariableNameMap& inputs,
+         const framework::VariableNameMap& outputs,
+         const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {
     index_.resize(2);
     sub_net_op_.resize(2);
-    LOG(INFO) << "Initialization Done.";
   }
 
   CondOp(const CondOp& o)
@@ -44,87 +41,14 @@ class CondOp : public OperatorBase {
     PADDLE_THROW("Not implemented");
   }
 
-  void CreateScope(const Scope& scope) const {
-    auto sub_scopes_var = scope.FindVar("SubScopes");
-    PADDLE_ENFORCE(sub_scopes_var != nullptr, "");
-    auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
-    auto& sub_scope = scope.NewScope();
-    sub_scopes->push_back(&sub_scope);
-  }
+  void CreateScope(const framework::Scope& scope) const;
 
-  void CreateIndexTensor(const Scope& scope) const {
-    auto index_tensors_var = scope.FindVar("IndexTensors");
-    PADDLE_ENFORCE(index_tensors_var != nullptr, "");
-    auto& index_tensors =
-        *index_tensors_var->GetMutable<std::vector<Tensor*>>();
-    Tensor index_tensor;
-    index_tensors.push_back(&index_tensor);
-  }
+  void CreateIndexTensor(const framework::Scope& scope) const;
 
   /**
    * InferShape must be called before Run.
    */
-  void InferShape(const framework::Scope& scope) const override {
-    auto sub_scopes_var = scope.FindVar("SubScopes");
-    PADDLE_ENFORCE_NOT_NULL(sub_scopes_var);
-    auto& sub_scopes = *sub_scopes_var->GetMutable<std::vector<Scope*>>();
-    // auto& index_tensors =
-    // *scope.FindVar("IndexTensors")->GetMutable<std::vector<Tensor*>>();
-
-    for (int i = 0; i < 2; ++i) {
-      // Create two sub scopes for true and false branches
-      // sub_scopes[0] for the true branch and sub_scopes[1] for the false
-      // branch
-      CreateScope(scope);
-
-      // Create two tensors for true and false indices
-      // index_tensors[0] for the true branch and index_tensors[1] for the false
-      // branch
-      CreateIndexTensor(scope);
-
-      for (auto& input : Inputs("Xs")) {
-        // Create a new tensor in sub-scope for input-type tensor
-        Variable* v = sub_scopes[i]->NewVar(input);
-        Tensor* sub_input = v->GetMutable<Tensor>();
-        sub_input->Resize(scope.FindVar(input)->GetMutable<Tensor>()->dims());
-      }
-
-      // Inputs that do not require tailoring
-      /*for (auto& input : (*sub_net_op_[i]).Inputs()) {
-        // weights are located in the parent scope rather than sub scope
-        for (auto& var_name : input.second) {
-          if (!sub_scopes[i]->FindVar(var_name)) {
-            sub_scopes[i]->NewVar(var_name)->GetMutable<Tensor>();
-          }
-        }
-      }*/
-
-      // Outputs
-      for (auto& output : (*sub_net_op_[i]).Outputs()) {
-        for (auto& var_name : output.second) {
-          sub_scopes[i]->NewVar(var_name);
-        }
-      }
-
-      // each net calls InferShape
-      LOG(INFO) << "OK 3";
-      sub_net_op_[i]->InferShape(*sub_scopes[i]);
-      LOG(INFO) << "OK 4";
-    }
-
-    for (auto& output : Outputs("Outs")) {
-      Tensor* tensor_t_out =
-          sub_scopes[0]->FindVar(output)->GetMutable<Tensor>();
-      Tensor* tensor_f_out =
-          sub_scopes[1]->FindVar(output)->GetMutable<Tensor>();
-      Tensor* tensor_out = scope.FindVar(output)->GetMutable<Tensor>();
-      // check output size should be same
-      PADDLE_ENFORCE_EQ(tensor_t_out->dims(), tensor_f_out->dims(),
-                        "Outputs not of the same shape");
-      tensor_out->Resize(tensor_t_out->dims());
-    }
-    LOG(INFO) << "OK 5";
-  }
+  void InferShape(const framework::Scope& scope) const override;
 
   // Set True Block
   void set_truenet(std::unique_ptr<OperatorBase> net) {
@@ -137,74 +61,7 @@ class CondOp : public OperatorBase {
   }
 
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
-    auto sub_scopes = scope.FindVar("SubScopes")->Get<std::vector<Scope*>>();
-    auto index_tensors =
-        scope.FindVar("IndexTensors")->Get<std::vector<Tensor*>>();
-
-    std::string cond_name = Input("Cond");
-    Variable* cond_var = scope.FindVar(cond_name);
-    PADDLE_ENFORCE_NOT_NULL(cond_var)
-    const Tensor* cond = cond_var->GetMutable<Tensor>();
-
-    // Step 1: get the true/false index at runtime
-    // index_[0]: vector<int>, contains all index for cond[i] == true
-    // index_[1]: vector<int>, contains all index for cond[i] == false
-    for (int i = 0; i < 2; ++i) index_[i].clear();
-
-    const bool* cond_data = cond->data<bool>();
-    for (int i = 0; i < cond->dims()[0]; ++i) {
-      if (cond_data[i])
-        index_[0].push_back(i);
-      else
-        index_[1].push_back(i);
-    }
-    // put index_[0] and index_[1] into two tensors:
-    // index_tensor_[0] and index_tensor_[1]
-    framework::DDim dim = paddle::framework::make_ddim({0});
-    for (int i = 0; i < 2; ++i) {
-      dim[0] = index_[i].size();
-      int* tmp_ptr =
-          index_tensors[i]->mutable_data<int>(dim, platform::CPUPlace());
-      index_tensors[i]->Resize(dim);
-      memcpy(tmp_ptr, index_[i].data(), dim[0] * sizeof(int));
-    }
-
-    // Step 2: collect data by calling gather
-    for (int i = 0; i < 2; ++i) {
-      // i= 0/i for True and False branches respectively
-      for (auto& input : Inputs("Xs")) {
-        // find Tensor
-        // Tensor* tensor_parent = scope.FindVar(input)->GetMutable<Tensor>();
-        Variable* v = scope.FindVar(input);
-        Tensor* tensor_parent = v->GetMutable<Tensor>();
-        // Tensor* tensor_child =
-        // sub_scope_[i].FindVar(input)->GetMutable<Tensor>();
-        v = sub_scopes[i]->FindVar(input);
-        Tensor* tensor_child = v->GetMutable<Tensor>();
-        Gather<float>(dev_ctx.GetPlace(), tensor_parent, index_tensors[i],
-                      tensor_child);
-      }
-    }
-
-    // Step 3: run
-    for (int i = 0; i < 2; ++i) sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
-
-    // Step 4: merge output results
-    for (int i = 0; i < 2; ++i) {
-      // i= 0/i for True and False branches respectively
-      // for (auto& output : GetAttr<std::vector<std::string>>("sub_outputs")) {
-      for (auto& output : Outputs("Outs")) {
-        // find Tensor
-        Variable* v = scope.FindVar(output);
-        Tensor* tensor_parent = v->GetMutable<Tensor>();
-        v = sub_scopes[i]->FindVar(output);
-        Tensor* tensor_child = v->GetMutable<Tensor>();
-        ScatterUpdate<float>(dev_ctx.GetPlace(), tensor_child, index_tensors[i],
-                             tensor_parent);
-      }
-    }
-  }
+           const platform::DeviceContext& dev_ctx) const override;
 
  private:
   // sub_net_op_[0]: subnet_t
@@ -216,17 +73,5 @@ class CondOp : public OperatorBase {
   mutable std::vector<std::vector<int>> index_;
 };
 
-/*
-class CondGradientOp final : public OperatorBase {
-public:
-        void Init() override;
-
-        virtual void InferShape(const std::shared_ptr<Scope>& scope) const
-override;
-
-        virtual void Run(const std::shared_ptr<Scope>& scope,
-                   const platform::DeviceContext& dev_ctx) const override;
-};*/
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 3eeae856fb..34214ad2b3 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/framework/backward.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/cond_op.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
 #include "paddle/platform/enforce.h"
diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py
index bddd4d8908..1469d207d4 100644
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
@@ -217,7 +217,7 @@ class __RecurrentOp__(object):
 
 class __CondOp__(object):
     __proto__ = None
-    type = 'cond_op'
+    type = "cond"
 
     def __init__(self):
         # cache recurrent_op's proto
@@ -227,8 +227,8 @@ class __CondOp__(object):
                     self.__proto__ = op_proto
 
     def __call__(self, *args, **kwargs):
-        if self.type not in args and 'type' not in kwargs:
-            kwargs['type'] = self.type
+        if self.type not in args and "type" not in kwargs:
+            kwargs["type"] = self.type
         # create proto
         create_method = OpDescCreationMethod(self.__proto__)
         proto = create_method(*args, **kwargs)
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 6b22c00082..a2e3e978c7 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -27,6 +27,7 @@ py_test(test_operator SRCS test_operator.py)
 py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
 py_test(test_uniform_random_op SRCS test_uniform_random_op.py)
 py_test(test_recurrent_op SRCS test_recurrent_op.py)
+py_test(test_cond_op SRCS test_cond_op.py)
 py_test(test_sgd_op SRCS test_sgd_op.py)
 py_test(test_gradient_checker SRCS test_gradient_checker.py)
 py_test(test_lookup_table SRCS test_lookup_table.py)
diff --git a/python/paddle/v2/framework/tests/test_cond_op.py b/python/paddle/v2/framework/tests/test_cond_op.py
index 1fe5889b7f..37177ae0b2 100644
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
@@ -11,15 +11,15 @@ class PySimpleCond(object):
     '''
 
     def __init__(self):
-        array = [True] * 10
+        array = [1] * 10
         for i in range(1, 10, 2):
-            array[i] = False
+            array[i] = 0
         self.cond = np.array(array)
         self.x = np.ones(shape=(10, 1))
 
     def forward(self):
-        self.index_t = np.where(self.cond)
-        self.index_f = np.where(self.cond == False)
+        self.index_t = np.where(self.cond == 1)
+        self.index_f = np.where(self.cond == 0)
         y_t = self.x[self.index_t]
         y_f = self.x[self.index_f]
         y_t = y_t * 2.
@@ -36,7 +36,6 @@ class PySimpleCondTest(unittest.TestCase):
 
     def test_forward(self):
         output = self.condnn.forward()
-        print 'output', output
 
 
 def create_tensor(scope, name, shape, np_data):
@@ -67,47 +66,50 @@ class TestCondOp(unittest.TestCase):
         self.create_cond_op()
         self.create_sub_net()
         ctx = core.DeviceContext.create(core.CPUPlace())
-        print 'running infer shape'
-        print self.scope.find_var("SubScopes")
         self.condop.infer_shape(self.scope)
-        print 'ok 2'
         self.condop.run(self.scope, ctx)
-        print 'ok 3'
-        return np.array(self.scope.find_var("Outs").get_tensor())
+        return np.array(self.scope.find_var("Out").get_tensor())
 
     def create_global_variables(self):
         x_np_data = self.py_cond.x
-        create_tensor(self.scope, "x", [10, 1], x_np_data)
-        cond_np_data = self.py_cond.cond
-        create_tensor(self.scope, "cond", [10, 1], x_np_data)
+        create_tensor(self.scope, "X", [10, 1], x_np_data)
+        cond_np_data = self.py_cond.cond.astype("int32")
+        create_tensor(self.scope, "cond", [10, 1], cond_np_data)
         self.scope.new_var("SubScopes")
         self.scope.new_var("IndexTensors")
-        self.scope.new_var("Outs")
+        self.scope.new_var("Out")
 
     def create_cond_op(self):
         self.condop = CondOp(
             Cond="cond",
-            Xs=["x"],
-            Outs=['Out_final'],
+            Xs=["X"],
+            Outs=["Out"],
             SubScopes="SubScopes",
             IndexTensors="IndexTensors")
 
     def create_sub_net(self):
         truenet = core.Net.create()
-        scale_op_t = Operator("scale", X='X', Y='Out', scale=2.)
+        scale_op_t = Operator("scale", X='X', Out='Out', scale=2.)
         truenet.append_op(scale_op_t)
         truenet.complete_add_op(True)
         self.condop.set_truenet(truenet)
 
         falsenet = core.Net.create()
-        scale_op_t = Operator("scale", X='X', Y='Out', scale=-2.)
+        scale_op_t = Operator("scale", X='X', Out='Out', scale=-2.)
         falsenet.append_op(scale_op_t)
         falsenet.complete_add_op(True)
         self.condop.set_falsenet(falsenet)
 
     def test_forward(self):
         print 'test cond op forward'
-        py_output = self.forward()
+        pd_output = self.forward()
+        py_output = self.py_cond.forward()
+        print 'pd_output', pd_output
+        print
+        print 'py_output', py_output
+        self.assertEqual(pd_output.shape, py_output.shape)
+        print 'test passed'
+        return 0
 
 
 if __name__ == "__main__":

From 2c8cbb8c3fd779a9993afd623e4ee53ed047a4ec Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Wed, 13 Sep 2017 15:39:29 -0700
Subject: [PATCH 09/17] if_else_op.md

---
 doc/design/if_else_op.md | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/doc/design/if_else_op.md b/doc/design/if_else_op.md
index 7370c2a24f..954a19c073 100644
--- a/doc/design/if_else_op.md
+++ b/doc/design/if_else_op.md
@@ -1,22 +1,4 @@
-IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has M (M<=N) instances, each corresponds to a true element in `cond`.
-
-```python
-import paddle as pd
-
-x = var()
-y = var()
-cond = var()
-
-b = pd.create_ifop(inputs=[x], output_num=1)
-with b.true_block():
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
-
-out = b(cond)
-```
-
-If we want the output still has N instances, we can use IfElseOp with a default value, whose minibatch size must be N:
+IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_block().
 
 ```python
 import paddle as pd
@@ -39,7 +21,7 @@ with b.false_block():
 out = b(cond)
 ```
 
-If only true_block is set in an IfElseOp, we can have a default value for false as:
+If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as:
 ```python
 import paddle as pd
 

From 59d661b9a93f214fd0dc10e3d032a7d9e6442e00 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 14 Sep 2017 17:11:48 +0800
Subject: [PATCH 10/17] Fix enforce test failed

Note: per dladdr(3), if no symbol with a suitable value is found, both dli_sname and dli_saddr shall be set to NULL, so dli_sname must be checked before use.
---
 paddle/platform/enforce.h | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 64fcbd93b6..df5f71ed76 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -25,6 +25,10 @@ limitations under the License. */
 #include "paddle/string/printf.h"
 #include "paddle/string/to_string.h"
 
+#ifdef __GNUC__
+#include <cxxabi.h>  // for __cxa_demangle
+#endif
+
 #ifndef PADDLE_ONLY_CPU
 
 #include "paddle/platform/dynload/cublas.h"
@@ -42,6 +46,19 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+namespace {
+#ifdef __GNUC__
+inline std::string demangle(std::string name) {
+  int status = -4;  // some arbitrary value to eliminate the compiler warning
+  std::unique_ptr<char, void (*)(void*)> res{
+      abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free};
+  return (status == 0) ? res.get() : name;
+}
+#else
+inline std::string demangle(std::string name) { return name; }
+#endif
+}
+
 struct EnforceNotMet : public std::exception {
   std::exception_ptr exp_;
   std::string err_str_;
@@ -61,8 +78,8 @@ struct EnforceNotMet : public std::exception {
 
       Dl_info info;
       for (int i = 0; i < size; ++i) {
-        if (dladdr(call_stack[i], &info)) {
-          auto demangled = info.dli_sname;
+        if (dladdr(call_stack[i], &info) && info.dli_sname) {
+          auto demangled = demangle(info.dli_sname);
           auto addr_offset = static_cast<char*>(call_stack[i]) -
                              static_cast<char*>(info.dli_saddr);
           sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,

From 39d79e64196049b6879612305bed604faac8a2dd Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Thu, 14 Sep 2017 14:20:33 -0700
Subject: [PATCH 11/17] modified codes

---
 paddle/framework/tensor.h       | 11 +------
 paddle/framework/tensor_test.cc |  8 ++---
 paddle/operators/CMakeLists.txt |  3 +-
 paddle/operators/cond_op.cc     | 55 ++++++++++++++++++++-------------
 paddle/operators/cond_op.h      | 17 ++++++++--
 5 files changed, 53 insertions(+), 41 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 20f019892b..4b5a2ae523 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -78,9 +78,6 @@ class Tensor {
   template <typename T>
   inline T* mutable_data(DDim dims, platform::Place place);
 
-  /*! Size of a single element in data() */
-  inline size_t element_size() { return holder_->element_size(); }
-
   /*! Return the dimensions of the memory block. */
   inline const DDim& dims() const;
 
@@ -132,7 +129,6 @@ class Tensor {
     virtual ~Placeholder() {}
     virtual void* ptr() const = 0;
     virtual size_t size() const = 0;
-    virtual size_t element_size() const = 0;
     virtual std::type_index type() const = 0;
     virtual platform::Place place() const = 0;
   };
@@ -143,8 +139,7 @@ class Tensor {
         : ptr_(static_cast<T*>(memory::Alloc(place, size)),
                memory::PODDeleter<T, Place>(place)),
           place_(place),
-          size_(size),
-          element_size_(sizeof(T)) {
+          size_(size) {
       PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                               (is_cpu_place(place_) ? "CPU" : "GPU"));
     }
@@ -153,7 +148,6 @@ class Tensor {
     virtual platform::Place place() const { return place_; }
     virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
     virtual std::type_index type() const { return std::type_index(typeid(T)); }
-    virtual size_t element_size() const { return element_size_; }
 
     /*! the pointer of memory block. */
     std::unique_ptr<T, memory::PODDeleter<T, Place>> ptr_;
@@ -163,9 +157,6 @@ class Tensor {
 
     /*! the size of memory block. */
     size_t size_;
-
-    /*! the size of a single element */
-    size_t element_size_;
   };
 
   /*! holds the memory block if allocated. */
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 8491536e6f..e2ec738de3 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -36,7 +36,7 @@ TEST(Tensor, DataAssert) {
   } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg =
-        "holder_ should not be null\nTenosr holds no memory. Call "
+        "holder_ should not be null\nTensor holds no memory. Call "
         "Tensor::mutable_data first.";
     const char* what = err.what();
     for (size_t i = 0; i < msg.length(); ++i) {
@@ -59,8 +59,6 @@ TEST(Tensor, MutableData) {
     // initialization
     p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
     EXPECT_NE(p1, nullptr);
-    // check tensor type
-    EXPECT_EQ(src_tensor.element_size(), sizeof(float));
     // set src_tensor a new dim with large size
     // momery is supposed to be re-allocated
     p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CPUPlace());
@@ -114,7 +112,7 @@ TEST(Tensor, ShareDataWith) {
     } catch (paddle::platform::EnforceNotMet err) {
       caught = true;
       std::string msg =
-          "holder_ should not be null\nTenosr holds no memory. Call "
+          "holder_ should not be null\nTensor holds no memory. Call "
           "Tensor::mutable_data first.";
       const char* what = err.what();
       for (size_t i = 0; i < msg.length(); ++i) {
@@ -276,4 +274,4 @@ TEST(Tensor, ReshapeToMatrix) {
   Tensor res = ReshapeToMatrix<int>(src, 2);
   ASSERT_EQ(res.dims()[0], 2 * 3);
   ASSERT_EQ(res.dims()[1], 4 * 9);
-}
\ No newline at end of file
+}
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 4e83eea4ac..e3e934bccc 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -80,8 +80,7 @@ endfunction()
 add_subdirectory(math)
 
 set(DEPS_OPS
-    recurrent_op)
-set(DEPS_OPS
+    recurrent_op
     cond_op)
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
   DEPS framework_proto tensor net_op)
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index a3e4a2506f..b2e1ca395d 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -28,6 +28,7 @@ namespace operators {
 using Scope = framework::Scope;
 using Variable = framework::Variable;
 using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 using DDim = framework::DDim;
 
 void CondOp::CreateScope(const Scope& scope) const {
@@ -41,8 +42,9 @@ void CondOp::CreateScope(const Scope& scope) const {
 void CondOp::CreateIndexTensor(const Scope& scope) const {
   auto index_tensors_var = scope.FindVar("IndexTensors");
   PADDLE_ENFORCE(index_tensors_var != nullptr, "");
-  auto& index_tensors = *index_tensors_var->GetMutable<std::vector<Tensor>>();
-  index_tensors.push_back(Tensor());
+  auto& index_tensors =
+      *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
+  index_tensors.push_back(LoDTensor());
 }
 
 void CondOp::InferShape(const Scope& scope) const {
@@ -65,8 +67,8 @@ void CondOp::InferShape(const Scope& scope) const {
     for (auto& input : Inputs("Xs")) {
       // Create a new tensor in sub-scope for input-type tensor
       Variable* v = sub_scopes[i]->NewVar(input);
-      Tensor* sub_input = v->GetMutable<Tensor>();
-      sub_input->Resize(scope.FindVar(input)->GetMutable<Tensor>()->dims());
+      LoDTensor* sub_input = v->GetMutable<LoDTensor>();
+      sub_input->Resize(scope.FindVar(input)->GetMutable<LoDTensor>()->dims());
     }
 
     for (auto& output : (*sub_net_op_[i]).Outputs()) {
@@ -80,33 +82,40 @@ void CondOp::InferShape(const Scope& scope) const {
   }
 
   for (auto& output : Outputs("Outs")) {
-    Tensor* tensor_t_out = sub_scopes[0]->FindVar(output)->GetMutable<Tensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should be NULL");
-    Tensor* tensor_f_out = sub_scopes[1]->FindVar(output)->GetMutable<Tensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "True output should be NULL");
+    LoDTensor* tensor_t_out =
+        sub_scopes[0]->FindVar(output)->GetMutable<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
+    LoDTensor* tensor_f_out =
+        sub_scopes[1]->FindVar(output)->GetMutable<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
 
     auto* tensor_out_var = scope.FindVar(output);
     PADDLE_ENFORCE_NOT_NULL(tensor_out_var, "Output not found");
-    Tensor* tensor_out = tensor_out_var->GetMutable<Tensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should be NULL");
+    LoDTensor* tensor_out = tensor_out_var->GetMutable<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
+                            "True output tensor should not be NULL");
+
     // check output size should be same
     PADDLE_ENFORCE_EQ(tensor_t_out->dims(), tensor_f_out->dims(),
                       "Outputs not of the same shape");
     tensor_out->Resize(tensor_t_out->dims());
-    tensor_out->mutable_data<float>(tensor_out->dims(), platform::CPUPlace());
+    // tensor_out->mutable_data<float>(tensor_out->dims(),
+    // platform::CPUPlace());
+    tensor_out->mutable_data<float>(platform::CPUPlace());
   }
 }
 
 void CondOp::Run(const Scope& scope,
                  const platform::DeviceContext& dev_ctx) const {
-  auto sub_scopes = scope.FindVar("SubScopes")->Get<std::vector<Scope*>>();
-  auto index_tensors =
-      scope.FindVar("IndexTensors")->Get<std::vector<Tensor>>();
+  auto* sub_scopes_var = scope.FindVar("SubScopes");
+  auto sub_scopes = sub_scopes_var->Get<std::vector<Scope*>>();
+  auto* index_tensors_var = scope.FindVar("IndexTensors");
+  auto index_tensors = index_tensors_var->Get<std::vector<LoDTensor>>();
 
   std::string cond_name = Input("Cond");
   Variable* cond_var = scope.FindVar(cond_name);
   PADDLE_ENFORCE_NOT_NULL(cond_var);
-  const Tensor* cond = cond_var->GetMutable<Tensor>();
+  const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
 
   // Step 1: get the true/false index at runtime
   // index_[0]: vector<int>, contains all index for cond[i] == true
@@ -139,11 +148,11 @@ void CondOp::Run(const Scope& scope,
       // find Tensor
       Variable* v = scope.FindVar(input);
       PADDLE_ENFORCE_NOT_NULL(v);
-      Tensor* tensor_parent = v->GetMutable<Tensor>();
+      LoDTensor* tensor_parent = v->GetMutable<LoDTensor>();
 
       v = sub_scopes[i]->FindVar(input);
       PADDLE_ENFORCE_NOT_NULL(v);
-      Tensor* tensor_child = v->GetMutable<Tensor>();
+      LoDTensor* tensor_child = v->GetMutable<LoDTensor>();
 
       // Resize child
       DDim dim = tensor_child->dims();
@@ -157,7 +166,9 @@ void CondOp::Run(const Scope& scope,
   }
 
   // Step 3: run
-  for (int i = 0; i < 2; ++i) sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+  for (int i = 0; i < 2; ++i) {
+    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+  }
 
   // Step 4: merge output results
   for (int i = 0; i < 2; ++i) {
@@ -166,11 +177,11 @@ void CondOp::Run(const Scope& scope,
       // find Tensor
       Variable* v = scope.FindVar(output);
       PADDLE_ENFORCE_NOT_NULL(v);
-      Tensor* tensor_parent = v->GetMutable<Tensor>();
+      LoDTensor* tensor_parent = v->GetMutable<LoDTensor>();
 
       v = sub_scopes[i]->FindVar(output);
       PADDLE_ENFORCE_NOT_NULL(v);
-      Tensor* tensor_child = v->GetMutable<Tensor>();
+      LoDTensor* tensor_child = v->GetMutable<LoDTensor>();
 
       ScatterUpdate<float>(dev_ctx.GetPlace(), tensor_child, &index_tensors[i],
                            tensor_parent);
@@ -192,7 +203,9 @@ class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
 
     AddComment(R"DOC(
 Sample dependent Cond Operator:
-The equation is: Out[i] = subnet_t[i], if Cond[i] == true
+Given Cond[i] as a 1/0 vector to indicate true/false
+The equation is: 
+Out[i] = subnet_t[i], if Cond[i] == true
 Out[i] = subnet_t[i], if Cond[i] == false
 )DOC");
   }
diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
index 27a6e9e3c3..001096d31a 100644
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
@@ -24,6 +24,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+/*
+ * @brief CondOp is a dynamic if-else Operator
+ *
+ * It has an input tensor named cond indicating which netop each instance will
+ * run.
+ *
+ * if cond == 1, it will run true_net, which is a NetOp.
+ *
+ * if cond == 0, it will run false_net, which is another NetOp.
+ */
+
 class CondOp : public framework::OperatorBase {
  public:
   CondOp(const std::string& type, const framework::VariableNameMap& inputs,
@@ -45,18 +56,18 @@ class CondOp : public framework::OperatorBase {
 
   void CreateIndexTensor(const framework::Scope& scope) const;
 
-  /**
+  /*
    * InferShape must be called before Run.
    */
   void InferShape(const framework::Scope& scope) const override;
 
   // Set True Block
-  void set_truenet(std::unique_ptr<OperatorBase> net) {
+  void set_truenet(std::unique_ptr<OperatorBase>&& net) {
     sub_net_op_[0] = std::move(net);
   }
 
   // Set False Block
-  void set_falsenet(std::unique_ptr<OperatorBase> net) {
+  void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
     sub_net_op_[1] = std::move(net);
   }
 

From c557402855dca954aa65b20827d27f258d9106b2 Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Thu, 14 Sep 2017 14:51:45 -0700
Subject: [PATCH 12/17] cond_op modify

---
 paddle/operators/cond_op.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
index 001096d31a..7eeec84996 100644
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
@@ -61,12 +61,16 @@ class CondOp : public framework::OperatorBase {
    */
   void InferShape(const framework::Scope& scope) const override;
 
-  // Set True Block
+  /*
+   * Set True Block
+   */
   void set_truenet(std::unique_ptr<OperatorBase>&& net) {
     sub_net_op_[0] = std::move(net);
   }
 
-  // Set False Block
+  /*
+   * Set False Block
+   */
   void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
     sub_net_op_[1] = std::move(net);
   }

From 98c35729cee39adf86df565ebfd2aa7bf167d960 Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Thu, 14 Sep 2017 15:33:24 -0700
Subject: [PATCH 13/17] remove empty line

---
 paddle/operators/cond_op.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
index 7eeec84996..b09e32331e 100644
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
@@ -34,7 +34,6 @@ namespace operators {
  *
  * if cond == 0, it will run false_net, which is another NetOp.
  */
-
 class CondOp : public framework::OperatorBase {
  public:
   CondOp(const std::string& type, const framework::VariableNameMap& inputs,

From 0620b00eb6a9a3ed9311c6b3e1acc5b6a0cc289e Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Thu, 14 Sep 2017 15:46:25 -0700
Subject: [PATCH 14/17] Fix Clang compile error

---
 paddle/operators/cos_sim_op.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
index 318b63f370..bcf6f758ca 100644
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -56,7 +56,7 @@ class CosSimKernel : public framework::OpKernel {
     x_norm.device(place) = x.square().sum(row_along).sqrt();
     y_norm.device(place) = y.square().sum(row_along).sqrt();
     if (rows_x == rows_y) {
-      auto xy = (x * y).sum(Eigen::array<int, 1>({1}));
+      auto xy = (x * y).sum(Eigen::array<int, 1>({{1}}));
       z.device(place) = xy / x_norm / y_norm;
     } else {
       Eigen::DSizes<int, 2> bcast(rows_x, 1);
@@ -134,7 +134,7 @@ class CosSimGradKernel : public framework::OpKernel {
         out_grad_y->mutable_data<T>(context.GetPlace());
         auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1);
         auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast;
-        dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({0}));
+        dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({{0}}));
       }
     }
   }

From 773cfe62903e36726c0138c5e139b48b535f54ed Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Fri, 15 Sep 2017 12:05:13 +0800
Subject: [PATCH 15/17]  Delete unused `USE_OP` in combination op, and unused
 include in elementwise_mul_op.h

---
 paddle/operators/elementwise_mul_op.h | 2 --
 paddle/operators/minus_op.cc          | 2 --
 2 files changed, 4 deletions(-)

diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h
index e9ed679179..6d58da580b 100644
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -13,10 +13,8 @@
    limitations under the License. */
 
 #pragma once
-#include <iostream>
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index 8a583f24ed..1eac8f133b 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -77,8 +77,6 @@ class MinusGradOp : public NetOp {
 }  // namespace operators
 }  // namespace paddle
 
-USE_OP(scale);
-USE_NO_KERNEL_OP(identity);
 namespace ops = paddle::operators;
 REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, minus_grad,
             ops::MinusGradOp<float>);

From b0d9b68a5ffe08615414d412f464bc64c9f18497 Mon Sep 17 00:00:00 2001
From: tensor-tang <jian.j.tang@intel.com>
Date: Thu, 14 Sep 2017 20:34:21 +0800
Subject: [PATCH 16/17] unify functions of mkldnn_fc and refine comments

---
 paddle/gserver/layers/MKLDNNConvLayer.cpp |   9 +-
 paddle/gserver/layers/MKLDNNFcLayer.cpp   | 277 ++++++++++++++--------
 paddle/gserver/layers/MKLDNNFcLayer.h     |  59 +++++
 paddle/math/MKLDNNMatrix.h                |  11 +-
 4 files changed, 251 insertions(+), 105 deletions(-)

diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index f8c06c5f86..9088744bee 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -285,10 +285,9 @@ void MKLDNNConvLayer::resetWgtBiasValue(
   wgt = MKLDNNMatrix::create(weight_->getW(), pd->weights_primitive_desc());
   VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
 
-  bias = nullptr;
-  if (biases_ && biases_->getW()) {
-    bias = MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc());
-  }
+  bias = (biases_ && biases_->getW())
+             ? MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc())
+             : nullptr;
 }
 
 void MKLDNNConvLayer::resetOutValue(
@@ -356,6 +355,7 @@ void MKLDNNConvLayer::resetBwdWgtPD(
 
 void MKLDNNConvLayer::resetBwdDataPD(
     std::shared_ptr<conv_bwdData::primitive_desc>& pd) {
+  pd = nullptr;
   if (inputLayers_[0]->getOutput().grad == nullptr) {
     return;
   }
@@ -476,6 +476,7 @@ void MKLDNNConvLayer::resetWgtBiasGrad(
       << "primitive desc of weight grad and value should be equal";
   VLOG(MKLDNN_FMTS) << "weight grad format: " << wgt->getFormat();
 
+  bias = nullptr;
   if (biasVal_ == nullptr) {
     return;
   }
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index f70343251a..f60e221a6e 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -17,9 +17,6 @@ limitations under the License. */
 
 using namespace mkldnn;  // NOLINT
 typedef memory::format format;
-typedef inner_product_forward fc_fwd;
-typedef inner_product_backward_weights fc_bwdWgt;
-typedef inner_product_backward_data fc_bwdData;
 
 namespace paddle {
 
@@ -93,35 +90,88 @@ void MKLDNNFcLayer::reshape(
   printSizeInfo();
 }
 
-void MKLDNNFcLayer::resetFwd(std::vector<mkldnn::primitive>& pipeline,
+void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
                              MKLDNNMatrixPtr& in,
                              MKLDNNMatrixPtr& wgt,
                              MKLDNNMatrixPtr& bias,
                              MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-  bool hasBias = biases_ && biases_->getW();
-  const MatrixPtr& wgtVal = weight_->getW();
-  const MatrixPtr& biasVal = hasBias ? biases_->getW() : nullptr;
-  const MatrixPtr& outVal = output_.value;
+  resetFwdBuffers(in, wgt, bias, out);
+
+  resetFwdPD(fwdPD_, in, wgt, bias, out);
+
+  resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
+
+  printValueFormatFlow();
+}
+
+void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
+                             MKLDNNMatrixPtr& in,
+                             MKLDNNMatrixPtr& wgt,
+                             MKLDNNMatrixPtr& bias,
+                             MKLDNNMatrixPtr& out) {
+  std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD;
+  std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD;
+
+  resetBwdBuffers(in, wgt, bias, out);
+
+  resetBwdWgtPD(bwdWgtPD, wgt, bias, out);
+
+  resetBwdDataPD(bwdDataPD, in, out);
+
+  resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+
+  printGradFormatFlow();
+}
+
+void MKLDNNFcLayer::updateInputData() {
+  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+}
 
+void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
+  weight_->getParameterPtr()->incUpdate(callback);
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                    MKLDNNMatrixPtr& wgt,
+                                    MKLDNNMatrixPtr& bias,
+                                    MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+
+  resetWgtBiasValue(wgt, bias);
+
+  resetOutValue(out);
+}
+
+void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
   if (inputIsOnlyMKLDNN()) {
-    const MatrixPtr& inVal = getInputValue(0);
-    in = std::dynamic_pointer_cast<MKLDNNMatrix>(inVal);
+    const MatrixPtr& dnnIn = getInputValue(0);
+    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
     CHECK(in) << "Input should be MKLDNNMatrix";
   } else {
     CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
-    const MatrixPtr& inVal = getInputValue(0, CPU_DEVICE);
+    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
     in = MKLDNNMatrix::create(
-        inVal, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
+        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
   }
   in->downSpatial();
+}
+
+void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
+                                      MKLDNNMatrixPtr& bias) {
   wgt = MKLDNNMatrix::create(
-      wgtVal, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
+      weight_->getW(), {oc_, ic_, ih_, iw_}, format::oihw, engine_);
   wgt->downSpatial();
-  bias = hasBias ? MKLDNNMatrix::create(biasVal, {oc_}, format::x, engine_)
-                 : nullptr;
-  out = MKLDNNMatrix::create(outVal, {bs_, oc_}, format::nc, engine_);
 
+  bias = (biases_ && biases_->getW())
+             ? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)
+             : nullptr;
+}
+
+void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
+  out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
   // change original output value to mkldnn output value
   output_.value = std::dynamic_pointer_cast<Matrix>(out);
   if (!outputIsOnlyMKLDNN()) {
@@ -129,46 +179,59 @@ void MKLDNNFcLayer::resetFwd(std::vector<mkldnn::primitive>& pipeline,
     // just share point
     getOutput(CPU_DEVICE).value->setData(output_.value->getData());
   }
+}
 
-  // create forward handle
+void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                               MKLDNNMatrixPtr in,
+                               MKLDNNMatrixPtr wgt,
+                               MKLDNNMatrixPtr bias,
+                               MKLDNNMatrixPtr out) {
+  CHECK(in);
+  CHECK(wgt);
+  CHECK(out);
   prop_kind pk = prop_kind::forward;
-  fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
-                                                in->getMemoryDesc(),
-                                                wgt->getMemoryDesc(),
-                                                bias->getMemoryDesc(),
-                                                out->getMemoryDesc())
-                                 : fc_fwd::desc(pk,
-                                                in->getMemoryDesc(),
-                                                wgt->getMemoryDesc(),
-                                                out->getMemoryDesc());
-  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-  if (hasBias) {
-    fwd_.reset(new fc_fwd(fwdPD, *in, *wgt, *bias, *out));
+  fc_fwd::desc fwdDesc = bias != nullptr ? fc_fwd::desc(pk,
+                                                        in->getMemoryDesc(),
+                                                        wgt->getMemoryDesc(),
+                                                        bias->getMemoryDesc(),
+                                                        out->getMemoryDesc())
+                                         : fc_fwd::desc(pk,
+                                                        in->getMemoryDesc(),
+                                                        wgt->getMemoryDesc(),
+                                                        out->getMemoryDesc());
+  pd.reset(new fc_fwd::primitive_desc(fwdDesc, engine_));
+}
+
+void MKLDNNFcLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<fc_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+
+  if (bias) {
+    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
   } else {
-    fwd_.reset(new fc_fwd(fwdPD, *in, *wgt, *out));
+    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
   }
-  printValueFormatFlow();
 
   pipeline.push_back(*fwd_);
 }
 
-void MKLDNNFcLayer::resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                             MKLDNNMatrixPtr& in,
-                             MKLDNNMatrixPtr& wgt,
-                             MKLDNNMatrixPtr& bias,
-                             MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-  if (!needResetBwd_) {
-    return;
-  }
-  needResetBwd_ = false;
-  bool hasBias = biases_ && biases_->getWGrad();
+void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                    MKLDNNMatrixPtr& wgt,
+                                    MKLDNNMatrixPtr& bias,
+                                    MKLDNNMatrixPtr& out) {
+  resetOutGrad(out);
+
+  resetWgtBiasGrad(wgt, bias);
 
-  /// backward weight
-  CHECK(inVal_) << "Should have input value";
-  const MatrixPtr& wgtGrad = weight_->getWGrad();
-  const MatrixPtr& biasGrad = hasBias ? biases_->getWGrad() : nullptr;
+  resetInGrad(in);
+}
 
+void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
   // TODO(TJ): merge outgrad
   int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
   // for MKLDNN device:
@@ -178,66 +241,88 @@ void MKLDNNFcLayer::resetBwd(std::vector<mkldnn::primitive>& pipeline,
   // for CPU device:
   // fc do not need to convert from cpu device since output is always nc format
   // only need create from cpu device
-  const MatrixPtr& outGrad = getOutput(device).grad;
-  out = MKLDNNMatrix::create(outGrad, outVal_->getPrimitiveDesc());
-  wgt = MKLDNNMatrix::create(wgtGrad, wgtVal_->getPrimitiveDesc());
-  bias = hasBias ? MKLDNNMatrix::create(biasGrad, biasVal_->getPrimitiveDesc())
-                 : nullptr;
-
-  // create memory primitive desc
-  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
-                                      inVal_->getMemoryDesc(),
-                                      wgt->getMemoryDesc(),
-                                      out->getMemoryDesc());
-  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-  fc_bwdWgt::desc bwdWgtDesc = hasBias
-                                   ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
-                                                     wgt->getMemoryDesc(),
-                                                     bias->getMemoryDesc(),
-                                                     out->getMemoryDesc())
-                                   : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
-                                                     wgt->getMemoryDesc(),
-                                                     out->getMemoryDesc());
-  fc_bwdWgt::primitive_desc bwdWgtPD =
-      fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
-
-  if (hasBias) {
-    bwdWgt_.reset(new fc_bwdWgt(bwdWgtPD, *inVal_, *out, *wgt, *bias));
-  } else {
-    bwdWgt_.reset(new fc_bwdWgt(bwdWgtPD, *inVal_, *out, *wgt));
+  CHECK(outVal_);
+  out =
+      MKLDNNMatrix::create(getOutput(device).grad, outVal_->getPrimitiveDesc());
+}
+
+void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
+                                     MKLDNNMatrixPtr& bias) {
+  CHECK(wgtVal_);
+  wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
+
+  bias = nullptr;
+  if (biasVal_ == nullptr) {
+    return;
   }
-  pipeline.push_back(*bwdWgt_);
+  bias =
+      MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc());
+}
 
-  /// backward data
+void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
+  in = nullptr;
   const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
   if (inGrad == nullptr) {
     return;
   }
-  if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
-    // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
-  } else {
-    in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
-  }
-
-  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(
-      inVal_->getMemoryDesc(), wgt->getMemoryDesc(), out->getMemoryDesc());
-  fc_bwdData::primitive_desc bwdDataPD =
-      fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
+  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
+  CHECK(inVal_);
+  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+}
 
-  CHECK(wgtVal_) << "Should have weight memory";
-  bwdData_.reset(new fc_bwdData(bwdDataPD, *out, *wgtVal_, *in));
-  printGradFormatFlow();
-  pipeline.push_back(*bwdData_);
+void MKLDNNFcLayer::resetBwdWgtPD(
+    std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(inVal_);
+  fc_bwdWgt::desc bwdWgtDesc = bias ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                      wgt->getMemoryDesc(),
+                                                      bias->getMemoryDesc(),
+                                                      out->getMemoryDesc())
+                                    : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                      wgt->getMemoryDesc(),
+                                                      out->getMemoryDesc());
+  pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
 }
 
-void MKLDNNFcLayer::updateInputData() {
-  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+void MKLDNNFcLayer::resetBwdDataPD(
+    std::shared_ptr<fc_bwdData::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  CHECK(wgtVal_);
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(
+      in->getMemoryDesc(), wgtVal_->getMemoryDesc(), out->getMemoryDesc());
+  pd.reset(new fc_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
 }
 
-void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
-  weight_->getParameterPtr()->incUpdate(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getParameterPtr()->incUpdate(callback);
+void MKLDNNFcLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
+    std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  CHECK(inVal_);
+  if (bias) {
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
+  } else {
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt));
+  }
+  pipeline.push_back(*bwdWgt_);
+
+  if (bwdDataPD == nullptr) {
+    return;
   }
+  CHECK(wgtVal_) << "Should have weight memory";
+  bwdData_.reset(new fc_bwdData(*bwdDataPD, *out, *wgtVal_, *in));
+  pipeline.push_back(*bwdData_);
 }
+
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index 3119f86349..c76878aafa 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -18,6 +18,9 @@ limitations under the License. */
 #include "mkldnn.hpp"
 
 namespace paddle {
+typedef mkldnn::inner_product_forward fc_fwd;
+typedef mkldnn::inner_product_backward_weights fc_bwdWgt;
+typedef mkldnn::inner_product_backward_data fc_bwdData;
 
 /**
  * @brief A subclass of MKLDNNLayer fc layer.
@@ -32,6 +35,9 @@ protected:
   // if has already init the weight
   bool hasInitedWgt_;
 
+  // saved forward primitive_desc, which is reused in the backward pass
+  std::shared_ptr<fc_fwd::primitive_desc> fwdPD_;
+
   // fc weight and bias
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> biases_;
@@ -67,6 +73,59 @@ public:
   void convertWeightsFromPaddle() override;
 
   void convertWeightsToPaddle() override;
+
+protected:
+  /**
+   * Forward functions: reset buffers(input, output, weight and bias),
+   *                    reset primitive descriptor,
+   *                    reset pipeline.
+   */
+  void resetFwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetInValue(MKLDNNMatrixPtr& in);
+  void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
+  void resetOutValue(MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr wgt,
+                  MKLDNNMatrixPtr bias,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+
+  /**
+   * Backward functions: reset buffers(input, output, weight and bias),
+   *                     reset primitive descriptor for backward weight,
+   *                     reset primitive descriptor for backward data,
+   *                     reset pipeline.
+   */
+  void resetBwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetOutGrad(MKLDNNMatrixPtr& out);
+  void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
+  void resetInGrad(MKLDNNMatrixPtr& in);
+  void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
+                     MKLDNNMatrixPtr& wgt,
+                     MKLDNNMatrixPtr& bias,
+                     MKLDNNMatrixPtr& out);
+  void resetBwdDataPD(std::shared_ptr<fc_bwdData::primitive_desc>& pd,
+                      MKLDNNMatrixPtr& in,
+                      MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
+                        std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
 };
 
 }  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index 0aa130b4a0..c843115eb9 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -66,11 +66,12 @@ public:
   /**
    * Create reorder primitive.
    * Create a mkldnn::reorder handle for converting src MKLDNNMatrix to dst.
-   * checkData: for whether to check the data handle of src and dst is the same.
-   *            if true, means check it and do not want support inplace reorder;
-   *            otherwise do not check data which means the created reorder
-   *            maybe inplace buffer and do not guarantee the logical is correct
-   *            since not all format or conversion support inplace.
+   * checkData: whether to check the data handles of src and dst.
+   *            If true, the handles are checked and must not be equal;
+   *            otherwise they are not checked, so the created reorder
+   *            may use an in-place buffer.
+   *            Do not set it to false unless you can guarantee that the
+   *            in-place logic works with your reorder.
    */
   static std::shared_ptr<mkldnn::reorder> createReorder(
       const MKLDNNMatrixPtr& src,

From eef1ccbf08605f6fb784a472540ba9c1cc959a67 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Fri, 15 Sep 2017 08:18:28 +0000
Subject: [PATCH 17/17] Add the check of inputs and outputs in all operators.

---
 paddle/operators/accuracy_op.cc               | 11 +++++++---
 paddle/operators/add_op.cc                    |  7 +++++++
 paddle/operators/concat_op.cc                 |  3 +++
 paddle/operators/cond_op.cc                   | 21 ++++++++++++++-----
 paddle/operators/cos_sim_op.cc                | 12 +++++++++--
 paddle/operators/elementwise_mul_op.cc        | 10 +++++++--
 paddle/operators/fill_zeros_like_op.cc        |  7 +++++++
 paddle/operators/gather_op.cc                 |  7 +++++++
 paddle/operators/gaussian_random_op.cc        |  8 +++++--
 paddle/operators/identity_op.cc               |  5 +++++
 paddle/operators/lookup_table_op.cc           | 15 +++++++++----
 paddle/operators/mean_op.cc                   |  4 +++-
 paddle/operators/minus_op.cc                  |  7 +++++++
 paddle/operators/mul_op.cc                    |  7 +++++++
 paddle/operators/onehot_cross_entropy_op.cc   | 10 +++++++++
 paddle/operators/pad_op.cc                    |  5 +++++
 paddle/operators/reshape_op.cc                |  6 +++++-
 paddle/operators/rowwise_add_op.cc            |  7 +++++++
 paddle/operators/scale_op.cc                  |  5 +++++
 paddle/operators/scatter_op.cc                |  9 ++++++++
 paddle/operators/sequence_avg_pool_op.cc      |  9 +++++---
 paddle/operators/sgd_op.cc                    |  7 +++++++
 paddle/operators/sigmoid_op.cc                |  5 +++++
 paddle/operators/softmax_op.cc                |  5 +++++
 paddle/operators/squared_l2_distance_op.cc    | 18 ++++++++++------
 paddle/operators/sum_op.cc                    |  5 +++++
 paddle/operators/top_k_op.cc                  |  7 ++++++-
 paddle/operators/uniform_random_op.cc         |  4 ++++
 .../{test_add_two_op.py => test_add_op.py}    |  0
 .../tests/test_gaussian_random_op.py          |  2 +-
 .../v2/framework/tests/test_identity_op.py    | 20 ++++++++++++++++++
 ...ookup_table.py => test_lookup_table_op.py} |  0
 .../v2/framework/tests/test_minus_op.py       |  2 +-
 ..._op.py => test_onehot_cross_entropy_op.py} |  2 +-
 ...le_and_identity_op.py => test_scale_op.py} | 15 +------------
 .../paddle/v2/framework/tests/test_sgd_op.py  |  2 +-
 .../v2/framework/tests/test_sigmoid_op.py     |  2 +-
 .../v2/framework/tests/test_top_k_op.py       |  6 ++++++
 .../framework/tests/test_uniform_random_op.py |  2 +-
 39 files changed, 229 insertions(+), 50 deletions(-)
 rename python/paddle/v2/framework/tests/{test_add_two_op.py => test_add_op.py} (100%)
 create mode 100644 python/paddle/v2/framework/tests/test_identity_op.py
 rename python/paddle/v2/framework/tests/{test_lookup_table.py => test_lookup_table_op.py} (100%)
 rename python/paddle/v2/framework/tests/{test_cross_entropy_op.py => test_onehot_cross_entropy_op.py} (95%)
 rename python/paddle/v2/framework/tests/{test_scale_and_identity_op.py => test_scale_op.py} (56%)

diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index 4a6c6381b0..0c813748b2 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -23,10 +23,15 @@ class AccuracyOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Inference"),
-                            "Input of Inference must be initialized.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("Inference"),
+        "Input(Inference) of AccuracyOp should not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
-                            "Input of Inference must be initialized.");
+                            "Input(Label) of AccuracyOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Accuracy"),
+        "Output(Accuracy) of AccuracyOp should not be null.");
+
     auto *inference = ctx.Input<framework::Tensor>("Inference");
     auto *label = ctx.Input<framework::Tensor>("Label");
 
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
index b43c09d4f0..e83c1efeaf 100644
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -23,6 +23,13 @@ class AddOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of AddOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of AddOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of AddOp should not be null.");
+
     PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
                       ctx.Input<Tensor>("Y")->dims(),
                       "Two input of Add Op's dimension must be same.");
diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc
index 72fd179354..223bb0ffe6 100644
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -25,6 +25,9 @@ class ConcatOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of ConcatOp should not be null.");
+
     auto ins = ctx.MultiInput<framework::Tensor>("X");
     auto *out = ctx.Output<framework::LoDTensor>("Out");
     size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index b2e1ca395d..8262a7a5c8 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -33,7 +33,8 @@ using DDim = framework::DDim;
 
 void CondOp::CreateScope(const Scope& scope) const {
   auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE(sub_scopes_var != nullptr, "");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
+                          "Output(SubScopes) of CondOp should not be null.");
   auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
   auto& sub_scope = scope.NewScope();
   sub_scopes->push_back(&sub_scope);
@@ -41,7 +42,8 @@ void CondOp::CreateScope(const Scope& scope) const {
 
 void CondOp::CreateIndexTensor(const Scope& scope) const {
   auto index_tensors_var = scope.FindVar("IndexTensors");
-  PADDLE_ENFORCE(index_tensors_var != nullptr, "");
+  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
+                          "Output(IndexTensors) of CondOp should not be null.");
   auto& index_tensors =
       *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
   index_tensors.push_back(LoDTensor());
@@ -49,7 +51,8 @@ void CondOp::CreateIndexTensor(const Scope& scope) const {
 
 void CondOp::InferShape(const Scope& scope) const {
   auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var);
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
+                          "Output(SubScopes) of CondOp should not be null.");
   auto& sub_scopes = *sub_scopes_var->GetMutable<std::vector<Scope*>>();
 
   for (int i = 0; i < 2; ++i) {
@@ -63,7 +66,8 @@ void CondOp::InferShape(const Scope& scope) const {
     // branch
     CreateIndexTensor(scope);
 
-    PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs can't be empty");
+    PADDLE_ENFORCE(!Inputs("Xs").empty(),
+                   "Inputs(Xs) of CondOp can't be empty.");
     for (auto& input : Inputs("Xs")) {
       // Create a new tensor in sub-scope for input-type tensor
       Variable* v = sub_scopes[i]->NewVar(input);
@@ -108,13 +112,18 @@ void CondOp::InferShape(const Scope& scope) const {
 void CondOp::Run(const Scope& scope,
                  const platform::DeviceContext& dev_ctx) const {
   auto* sub_scopes_var = scope.FindVar("SubScopes");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
+                          "Output(SubScopes) of CondOp should not be null.");
   auto sub_scopes = sub_scopes_var->Get<std::vector<Scope*>>();
   auto* index_tensors_var = scope.FindVar("IndexTensors");
+  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
+                          "Output(IndexTensors) of CondOp should not be null.");
   auto index_tensors = index_tensors_var->Get<std::vector<LoDTensor>>();
 
   std::string cond_name = Input("Cond");
   Variable* cond_var = scope.FindVar(cond_name);
-  PADDLE_ENFORCE_NOT_NULL(cond_var);
+  PADDLE_ENFORCE_NOT_NULL(cond_var,
+                          "Input(Cond) of CondOp should not be null.");
   const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
 
   // Step 1: get the true/false index at runtime
@@ -171,6 +180,8 @@ void CondOp::Run(const Scope& scope,
   }
 
   // Step 4: merge output results
+  PADDLE_ENFORCE(!Outputs("Outs").empty(),
+                 "Outputs(Outs) of CondOp can't be empty.");
   for (int i = 0; i < 2; ++i) {
     // i= 0/i for True and False branches respectively
     for (auto& output : Outputs("Outs")) {
diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc
index 253b17d8a1..72c4464936 100644
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -26,8 +26,16 @@ class CosSimOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
     // notnull check
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of CosSimOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of CosSimOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of CosSimOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("XNorm"),
+                            "Output(XNorm) of CosSimOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("YNorm"),
+                            "Output(YNorm) of CosSimOp should not be null.");
 
     // shape check
     auto x_dims = ctx.Input<Tensor>("X")->dims();
diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc
index e37c582adb..ee6e975b44 100644
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -25,8 +25,14 @@ class ElementWiseMulOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of ElementWiseMulOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of ElementWiseMulOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of ElementWiseMulOp should not be null.");
+
     auto x_dim = ctx.Input<Tensor>("X")->dims();
     auto y_dim = ctx.Input<Tensor>("Y")->dims();
     PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
index 0c9734892a..ba7857cc65 100644
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -23,6 +23,13 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("Src"),
+        "Input(Src) of FillZerosLikeOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Dst"),
+        "Output(Dst) of FillZerosLikeOp should not be null.");
+
     ctx.Output<framework::LoDTensor>("Dst")->Resize(
         ctx.Input<framework::Tensor>("Src")->dims());
   }
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
index 8883d6d5fe..d445b61c16 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -24,6 +24,13 @@ class GatherOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of GatherOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Index"),
+                            "Input(Index) of GatherOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of GatherOp should not be null.");
+
     int batch_size = ctx.Input<Tensor>("Index")->dims()[0];
     PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
     framework::DDim output_dims(ctx.Input<Tensor>("X")->dims());
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index 25b0776a37..c0e161bbc0 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -43,8 +43,12 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& context) const override {
-    auto* tensor = context.Output<framework::LoDTensor>("Out");
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of GaussianRandomOp should not be null.");
+
+    auto* tensor = ctx.Output<framework::LoDTensor>("Out");
     auto dims = Attr<std::vector<int>>("dims");
     std::vector<int64_t> temp;
     temp.reserve(dims.size());
diff --git a/paddle/operators/identity_op.cc b/paddle/operators/identity_op.cc
index 7d9d4fa519..b67ca5f6f8 100644
--- a/paddle/operators/identity_op.cc
+++ b/paddle/operators/identity_op.cc
@@ -42,6 +42,11 @@ class IdentityOp : public NetOp {
              const framework::VariableNameMap &outputs,
              const framework::AttributeMap &attrs)
       : NetOp(type, inputs, outputs, attrs) {
+    PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
+                      "Input(X) of IdentityOp should not be null.");
+    PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
+                      "Output(Out) of IdentityOp should not be null.");
+
     AppendOp(framework::OpRegistry::CreateOp(
         "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
         {{"scale", static_cast<AttrType>(1)}}));
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
index b3d15f1ec9..07f6dfabca 100644
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -22,10 +22,17 @@ class LookupTableOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &context) const override {
-    auto table_t = context.Input<Tensor>("W");
-    auto ids_t = context.Input<Tensor>("Ids");
-    auto output_t = context.Output<framework::LoDTensor>("Out");
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("W"),
+                            "Input(W) of LookupTableOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Ids"),
+                            "Input(Ids) of LookupTableOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of LookupTableOp should not be null.");
+    // Out is resized to {Ids->dims()[0], W->dims()[1]}.
+    auto table_t = ctx.Input<Tensor>("W");
+    auto ids_t = ctx.Input<Tensor>("Ids");
+    auto output_t = ctx.Output<framework::LoDTensor>("Out");
 
     output_t->Resize({ids_t->dims()[0], table_t->dims()[1]});
   }
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index 3e523d31b6..7d7eeb59a2 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -24,7 +24,9 @@ class MeanOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input of MeanOp must be initialized.");
+                            "Input(X) of MeanOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),  // Out is resized below
+                            "Output(Out) of MeanOp should not be null.");
     ctx.Output<framework::LoDTensor>("Out")->Resize({1});
   }
 };
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index 8a583f24ed..61fe49ce32 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -27,6 +27,13 @@ class MinusOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of MinusOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of MinusOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of MinusOp should not be null.");
+
     auto *left_tensor = ctx.Input<framework::Tensor>("X");
     auto *right_tensor = ctx.Input<framework::Tensor>("Y");
 
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 015e13de9a..b6d320b415 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -26,6 +26,13 @@ class MulOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of MulOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of MulOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of MulOp should not be null.");
+
     auto x_dims = ctx.Input<Tensor>("X")->dims();
     auto y_dims = ctx.Input<Tensor>("Y")->dims();
     int x_num_col_dims = Attr<int>("x_num_col_dims");
diff --git a/paddle/operators/onehot_cross_entropy_op.cc b/paddle/operators/onehot_cross_entropy_op.cc
index a9baada1cd..f38be3549f 100644
--- a/paddle/operators/onehot_cross_entropy_op.cc
+++ b/paddle/operators/onehot_cross_entropy_op.cc
@@ -23,6 +23,16 @@ class OnehotCrossEntropyOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("X"),
+        "Input(X) of OnehotCrossEntropyOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("label"),
+        "Input(label) of OnehotCrossEntropyOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Y"),
+        "Output(Y) of OnehotCrossEntropyOp should not be null.");
+
     auto *X = ctx.Input<Tensor>("X");
     auto *label = ctx.Input<Tensor>("label");
 
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
index 6cf7bd6f35..a0b1c6b631 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -25,6 +25,11 @@ class PadOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of PadOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of PadOp should not be null.");
+
     auto x_dim = ctx.Input<Tensor>("X")->dims();
     auto paddings = Attr<std::vector<int>>("paddings");
     PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()),
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
index d281702092..0d05e34414 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -28,7 +28,11 @@ class ReshapeOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
     // input check
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) shouldn't be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of ReshapeOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of ReshapeOp should not be null.");
+
     auto shape = ctx.Attr<std::vector<int>>("shape");
     PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
     for (auto dim : shape) {
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index c6101685a3..2a3fd3be94 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -25,6 +25,13 @@ class RowwiseAddOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of RowwiseAddOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("b"),
+                            "Input(b) of RowwiseAddOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of RowwiseAddOp should not be null.");
+
     auto x_dims = ctx.Input<Tensor>("X")->dims();
     auto b_dims = ctx.Input<Tensor>("b")->dims();
     PADDLE_ENFORCE_GT(
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index 35e6b70ba9..d1f42e8662 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -27,6 +27,11 @@ class ScaleOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of ScaleOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of ScaleOp should not be null.");
+
     auto *in = ctx.Input<framework::Tensor>("X");
     auto *out = ctx.Output<framework::LoDTensor>("Out");
     out->Resize(in->dims());
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
index 0f7510983e..8820262732 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -24,6 +24,15 @@ class ScatterOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Ref"),
+                            "Input(Ref) of ScatterOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Index"),
+                            "Input(Index) of ScatterOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Updates"),
+                            "Input(Updates) of ScatterOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of ScatterOp should not be null.");
+
     PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Index")->dims().size(), 1,
                       "Update Index should be 1-D.");
     PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Ref")->dims().size(),
diff --git a/paddle/operators/sequence_avg_pool_op.cc b/paddle/operators/sequence_avg_pool_op.cc
index c15a5833de..eb3e37655b 100644
--- a/paddle/operators/sequence_avg_pool_op.cc
+++ b/paddle/operators/sequence_avg_pool_op.cc
@@ -23,9 +23,12 @@ class SequenceAvgPoolOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input of SequenceAvgPoolOp"
-                            "must be initialized.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("X"), "Input(X) of SequenceAvgPoolOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of SequenceAvgPoolOp should not be null.");
+
     auto* x = ctx.Input<framework::LoDTensor>("X");
     auto dims = x->dims();
     auto lod = x->lod();
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 7997bf6907..1232e64c7f 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -23,6 +23,13 @@ class SGDOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("param"),
+                            "Input(param) of SGDOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("grad"),
+                            "Input(grad) of SGDOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("param_out"),
+                            "Output(param_out) of SGDOp should not be null.");
+
     PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("param")->dims(),
                       ctx.Input<Tensor>("grad")->dims(),
                       "Two input of SGD Op's dimension must be same.");
diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc
index de6a1ba773..992b19965e 100644
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -23,6 +23,11 @@ class SigmoidOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of SigmoidOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"),
+                            "Output(Y) of SigmoidOp should not be null.");
+    // Both are dereferenced below: Y is resized to X's dims.
     ctx.Output<framework::LoDTensor>("Y")->Resize(
         ctx.Input<Tensor>("X")->dims());
   }
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 239d3d141e..c67eb028c8 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -23,6 +23,11 @@ class SoftmaxOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of SoftmaxOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"),
+                            "Output(Y) of SoftmaxOp should not be null.");
+
     PADDLE_ENFORCE(ctx.Input<Tensor>("X")->dims().size() == 2UL,
                    "The input of softmax op must be a matrix.");
     ctx.Output<framework::LoDTensor>("Y")->Resize(
diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc
index ebe5bd352e..39f4305877 100644
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -23,12 +23,18 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input of SquaredL2DistanceOp "
-                            "must be initialized.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
-                            "Target of SquaredL2DistanceOp "
-                            "must be initialized.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("X"),
+        "Input(X) of SquaredL2DistanceOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("Y"),
+        "Input(Y) of SquaredL2DistanceOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("sub_result"),
+        "Output(sub_result) of SquaredL2DistanceOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of SquaredL2DistanceOp should not be null.");
 
     auto* x = ctx.Input<Tensor>("X");
     auto x_dims = x->dims();
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index 7170e7256c..41e05c27f9 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -22,6 +22,11 @@ class SumOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(!ctx.MultiInputVar("X").empty(),
+                   "Input(X) of SumOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of SumOp should not be null.");
+
     auto ins = ctx.MultiInput<framework::Tensor>("X");
     auto *out = ctx.Output<framework::LoDTensor>("Out");
     int N = ins.size();
diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc
index ff0e77a344..169b815fef 100644
--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
@@ -24,7 +24,12 @@ class TopkOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input of TopkOP must be initialized.");
+                            "Input(X) of TopkOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of TopkOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Indices"),
+                            "Output(Indices) of TopkOp should not be null.");
+
     auto *input = ctx.Input<framework::Tensor>("X");
     const int k = static_cast<int>(ctx.Attr<int>("k"));
 
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index ed79736936..184bcbc29c 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -48,6 +48,10 @@ class UniformRandomOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of UniformRandomOp should not be null.");
+
     PADDLE_ENFORCE(Attr<float>("min") < Attr<float>("max"),
                    "uniform_random's min must less then max");
     auto* tensor = ctx.Output<framework::LoDTensor>("Out");
diff --git a/python/paddle/v2/framework/tests/test_add_two_op.py b/python/paddle/v2/framework/tests/test_add_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_add_two_op.py
rename to python/paddle/v2/framework/tests/test_add_op.py
diff --git a/python/paddle/v2/framework/tests/test_gaussian_random_op.py b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
index 1f9e4db783..1888ee28f9 100644
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
@@ -4,7 +4,7 @@ from paddle.v2.framework.op import Operator
 import numpy
 
 
-class GaussianRandomTest(unittest.TestCase):
+class TestGaussianRandomOp(unittest.TestCase):
     def test_cpu(self):
         self.gaussian_random_test(place=core.CPUPlace())
 
diff --git a/python/paddle/v2/framework/tests/test_identity_op.py b/python/paddle/v2/framework/tests/test_identity_op.py
new file mode 100644
index 0000000000..2e95e7c786
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_identity_op.py
@@ -0,0 +1,20 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestIdentityOp(OpTest):  # OpTest case for the "identity" operator
+    def setUp(self):  # declares the op type, its input X and expected Out
+        self.op_type = "identity"
+        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
+        self.outputs = {'Out': self.inputs['X']}  # expected Out is X itself
+
+    def test_check_output(self):
+        self.check_output()  # presumably checks against self.outputs
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')  # presumably d(Out)/d(X); confirm
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lookup_table.py b/python/paddle/v2/framework/tests/test_lookup_table_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_lookup_table.py
rename to python/paddle/v2/framework/tests/test_lookup_table_op.py
diff --git a/python/paddle/v2/framework/tests/test_minus_op.py b/python/paddle/v2/framework/tests/test_minus_op.py
index dea797a1fe..c56d7cb548 100644
--- a/python/paddle/v2/framework/tests/test_minus_op.py
+++ b/python/paddle/v2/framework/tests/test_minus_op.py
@@ -3,7 +3,7 @@ import numpy as np
 from op_test import OpTest
 
 
-class MinusOpTest(OpTest):
+class TestMinusOp(OpTest):
     def setUp(self):
         self.op_type = "minus"
         self.inputs = {
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_onehot_cross_entropy_op.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_cross_entropy_op.py
rename to python/paddle/v2/framework/tests/test_onehot_cross_entropy_op.py
index 253e7b8a24..fd3cbdb803 100644
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_onehot_cross_entropy_op.py
@@ -3,7 +3,7 @@ import numpy
 from op_test import OpTest
 
 
-class TestCrossEntropy(OpTest):
+class TestOnehotCrossEntropyOp(OpTest):
     def setUp(self):
         self.op_type = "onehot_cross_entropy"
         batch_size = 30
diff --git a/python/paddle/v2/framework/tests/test_scale_and_identity_op.py b/python/paddle/v2/framework/tests/test_scale_op.py
similarity index 56%
rename from python/paddle/v2/framework/tests/test_scale_and_identity_op.py
rename to python/paddle/v2/framework/tests/test_scale_op.py
index 05d76d4282..2ea1e18547 100644
--- a/python/paddle/v2/framework/tests/test_scale_and_identity_op.py
+++ b/python/paddle/v2/framework/tests/test_scale_op.py
@@ -3,20 +3,7 @@ import numpy as np
 from op_test import OpTest
 
 
-class IdentityTest(OpTest):
-    def setUp(self):
-        self.op_type = "identity"
-        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
-        self.outputs = {'Out': self.inputs['X']}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class ScaleTest(OpTest):
+class TestScaleOp(OpTest):
     def setUp(self):
         self.op_type = "scale"
         self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py
index 557cf15ace..64e54d1500 100644
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -3,7 +3,7 @@ import numpy as np
 from op_test import OpTest
 
 
-class TestSGD(OpTest):
+class TestSGDOp(OpTest):
     def setUp(self):
         self.op_type = "sgd"
         w = np.random.random((102, 105)).astype("float32")
diff --git a/python/paddle/v2/framework/tests/test_sigmoid_op.py b/python/paddle/v2/framework/tests/test_sigmoid_op.py
index 2316e49eff..d65d887db4 100644
--- a/python/paddle/v2/framework/tests/test_sigmoid_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py
@@ -3,7 +3,7 @@ import numpy as np
 from op_test import OpTest
 
 
-class TestSigmoid(OpTest):
+class TestSigmoidOp(OpTest):
     def setUp(self):
         self.op_type = "sigmoid"
         self.inputs = {
diff --git a/python/paddle/v2/framework/tests/test_top_k_op.py b/python/paddle/v2/framework/tests/test_top_k_op.py
index cab799256d..694f37d612 100644
--- a/python/paddle/v2/framework/tests/test_top_k_op.py
+++ b/python/paddle/v2/framework/tests/test_top_k_op.py
@@ -21,6 +21,9 @@ class TestTopkOp(OpTest):
 
         self.outputs = {'Out': output, 'Indices': indices}
 
+    def test_check_output(self):
+        self.check_output()  # presumably checks against self.outputs
+
 
 class TestTopkOp3d(OpTest):
     def setUp(self):
@@ -42,6 +45,9 @@ class TestTopkOp3d(OpTest):
 
         self.outputs = {'Out': output, 'Indices': indices}
 
+    def test_check_output(self):
+        self.check_output()  # presumably checks against self.outputs
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_uniform_random_op.py b/python/paddle/v2/framework/tests/test_uniform_random_op.py
index 76a5e36e56..9e8898fb59 100644
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
@@ -4,7 +4,7 @@ import paddle.v2.framework.core as core
 import numpy
 
 
-class UniformRandomTest(unittest.TestCase):
+class TestUniformRandomOp(unittest.TestCase):
     def test_uniform_random_cpu(self):
         self.uniform_random_test(place=core.CPUPlace())