From 8fd845e0fab40acc9539c4109feb3ed411f4dc8b Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Sat, 30 Sep 2017 16:55:40 -0700
Subject: [PATCH 01/15] Unify Map in OpDescBind

---
 paddle/framework/op_desc.cc | 27 ++++++++++++++++++++++++++-
 paddle/framework/op_desc.h  | 37 ++++++-------------------------------
 paddle/platform/enforce.h   |  4 ++--
 3 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 0c12c55dc0..33a064890c 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -112,6 +112,30 @@ const std::unordered_map<std::string, Attribute> &OpDescBind::GetAttrMap()
   return attrs_;
 }
 
+struct SetAttrDescVisitor : public boost::static_visitor<void> {
+  explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
+  mutable OpDesc::Attr *attr_;
+  void operator()(int v) const { attr_->set_i(v); }
+  void operator()(float v) const { attr_->set_f(v); }
+  void operator()(const std::string &v) const { attr_->set_s(v); }
+  void operator()(bool b) const { attr_->set_b(b); }
+
+  void operator()(const std::vector<int> &v) const {
+    VectorToRepeated(v, attr_->mutable_ints());
+  }
+  void operator()(const std::vector<float> &v) const {
+    VectorToRepeated(v, attr_->mutable_floats());
+  }
+  void operator()(const std::vector<std::string> &v) const {
+    VectorToRepeated(v, attr_->mutable_strings());
+  }
+  void operator()(const std::vector<bool> &v) const {
+    VectorToRepeated(v, attr_->mutable_bools());
+  }
+  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->idx()); }
+  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+};
+
 void OpDescBind::Sync() {
   if (need_update_) {
     this->op_desc_.mutable_inputs()->Clear();
@@ -134,7 +158,8 @@ void OpDescBind::Sync() {
       attr_desc->set_name(attr.first);
       attr_desc->set_type(
           static_cast<framework::AttrType>(attr.second.which() - 1));
-      boost::apply_visitor(SetAttrDescVisitor(attr_desc), attr.second);
+      SetAttrDescVisitor visitor(attr_desc);
+      boost::apply_visitor(visitor, attr.second);
     }
 
     need_update_ = false;
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index 0cf7d13971..e03b4d067f 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/op_info.h"
 #include "paddle/framework/var_desc.h"
 
 namespace paddle {
@@ -61,48 +62,22 @@ class OpDescBind {
   void SetBlockAttr(const std::string &name, BlockDescBind &block);
 
   // Only be used in C++
-  void SetAttrMap(const std::unordered_map<std::string, Attribute> &attr_map);
+  void SetAttrMap(const AttributeMap &attr_map);
 
   Attribute GetAttr(const std::string &name) const;
 
   int GetBlockAttr(const std::string &name) const;
 
   // Only be used in C++
-  const std::unordered_map<std::string, Attribute> &GetAttrMap() const;
+  const AttributeMap &GetAttrMap() const;
 
  private:
-  struct SetAttrDescVisitor : public boost::static_visitor<void> {
-    explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
-    mutable OpDesc::Attr *attr_;
-    void operator()(int v) const { attr_->set_i(v); }
-    void operator()(float v) const { attr_->set_f(v); }
-    void operator()(const std::string &v) const { attr_->set_s(v); }
-    void operator()(bool b) const { attr_->set_b(b); }
-
-    void operator()(const std::vector<int> &v) const {
-      VectorToRepeated(v, attr_->mutable_ints());
-    }
-    void operator()(const std::vector<float> &v) const {
-      VectorToRepeated(v, attr_->mutable_floats());
-    }
-    void operator()(const std::vector<std::string> &v) const {
-      VectorToRepeated(v, attr_->mutable_strings());
-    }
-    void operator()(const std::vector<bool> &v) const {
-      VectorToRepeated(v, attr_->mutable_bools());
-    }
-    void operator()(BlockDesc *desc) const {
-      attr_->set_block_idx(desc->idx());
-    }
-    void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
-  };
-
   void Sync();
 
   OpDesc op_desc_;
-  std::unordered_map<std::string, std::vector<std::string>> inputs_;
-  std::unordered_map<std::string, std::vector<std::string>> outputs_;
-  std::unordered_map<std::string, Attribute> attrs_;
+  VariableNameMap inputs_;
+  VariableNameMap outputs_;
+  AttributeMap attrs_;
 
   // need_update_ indicate there some local changes not be synchronized. If
   // local changes should be synchronized, need_update_ should be set to true.
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index b523ef03c0..52bd23039b 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -185,7 +185,7 @@ inline void throw_on_error(T e) {
         std::make_exception_ptr(                                       \
             std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
         __FILE__, __LINE__);                                           \
-  } while (0)
+  } while (false)
 
 #define PADDLE_ENFORCE(...)                                             \
   do {                                                                  \
@@ -195,7 +195,7 @@ inline void throw_on_error(T e) {
       throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
                                               __FILE__, __LINE__);      \
     }                                                                   \
-  } while (0)
+  } while (false)
 
 /*
  * Some enforce helpers here, usage:

From 7163dd0413d5b99261ff95e0fab28a09f8abb74a Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Sun, 1 Oct 2017 09:27:44 -0700
Subject: [PATCH 02/15] revert code

---
 paddle/operators/recurrent_op.cc              | 41 +++++++++++++++++++
 paddle/operators/recurrent_op.h               | 19 +++++++++
 .../v2/framework/tests/test_recurrent_op.py   |  3 --
 3 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 80de229c33..b9fba3e135 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -28,6 +28,29 @@ using Variable = framework::Variable;
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
+void RecurrentAlgorithm::InferShape(const Scope& scope) const {
+  auto* input0 = scope.FindVar(arg_->inlinks[0]);
+  PADDLE_ENFORCE_NOT_NULL(input0);
+  seq_len_ = input0->GetMutable<LoDTensor>()->dims()[0];
+  PADDLE_ENFORCE_GT(seq_len_, 0);
+
+  CreateScopes(scope);
+  auto& step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
+                     true /*infer_shape_mode*/);
+  InitMemories(step_scopes[0], true /*infer_shape_mode*/);
+
+  for (size_t i = 0; i < seq_len_; i++) {
+    if (i > 0) {
+      rnn::LinkMemories(step_scopes, arg_->memories, i, -1,
+                        true /*infer_shape_mode*/);
+    }
+    (*stepnet_)->InferShape(*step_scopes[i]);
+  }
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
+                     true /*infer_shape_mode*/);
+}
+
 void RecurrentAlgorithm::Run(const Scope& scope,
                              const platform::DeviceContext& dev_ctx) const {
   auto step_scopes = GetStepScopes(scope);
@@ -179,6 +202,24 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
   }
 }
 
+void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
+  seq_len_ =
+      scope.FindVar(arg_->inlinks[0])->GetMutable<LoDTensor>()->dims()[0];
+  auto step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
+                     true /*infer_shape_mode*/);
+  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
+    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
+      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
+                        true /*infer_shape_mode*/);
+    }
+    (*stepnet_)->InferShape(*step_scopes[step_id]);
+  }
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
+                     true /*infer_shape_mode*/);
+  LinkBootMemoryGradients(step_scopes[0], true /*infer_shape_mode*/);
+}
+
 RecurrentGradientOp::RecurrentGradientOp(
     const std::string& type, const framework::VariableNameMap& inputs,
     const framework::VariableNameMap& outputs,
diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h
index c6b9a5533e..18f8c53e18 100644
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -41,6 +41,11 @@ class RecurrentAlgorithm {
     stepnet_ = stepnet;
   }
 
+  /**
+   * InferShape must be called before Run.
+   */
+  void InferShape(const framework::Scope& scope) const;
+
  protected:
   /*
    * The step scopes will be stored in the father scope as a variable.
@@ -89,6 +94,11 @@ class RecurrentGradientAlgorithm {
   void LinkBootMemoryGradients(framework::Scope* step_scopes,
                                bool infer_shape_mode) const;
 
+  /**
+ * InferShape must be called before Run.
+ */
+  void InferShape(const framework::Scope& scope) const;
+
  protected:
   inline const std::vector<framework::Scope*>& GetStepScopes(
       const framework::Scope& scope) const {
@@ -123,8 +133,13 @@ class RecurrentOp : public framework::OperatorBase {
   void set_stepnet(std::unique_ptr<OperatorBase> net) {
     stepnet_ = std::move(net);
   }
+
   const OperatorBase& stepnet() const { return *stepnet_; }
 
+  void InferShape(const framework::Scope& scope) const {
+    alg_.InferShape(scope);
+  }
+
   static const rnn::ArgumentName kArgName;
 
  private:
@@ -147,6 +162,10 @@ class RecurrentGradientOp : public framework::OperatorBase {
     PADDLE_THROW("Not Implemented");
   }
 
+  void InferShape(const framework::Scope& scope) const {
+    alg_.InferShape(scope);
+  }
+
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {
     alg_.Run(scope, dev_ctx);
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
index 92161ae5dd..6b9e7a88ce 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -197,7 +197,4 @@ class RecurrentGradientOpTest(unittest.TestCase):
 
 
 if __name__ == '__main__':
-    exit(
-        0
-    )  # FIXME(yuyang18): InferShape has been removed, this unittest may error
     unittest.main()

From 5423cb3e57949fc2885e39016422bf92b70b5260 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Sun, 1 Oct 2017 09:54:08 -0700
Subject: [PATCH 03/15] format

---
 paddle/framework/block_desc.h   |  6 +++---
 paddle/framework/op_info.h      |  8 +++-----
 paddle/framework/program_desc.h |  6 +++---
 paddle/framework/scope.h        |  8 +++-----
 paddle/platform/macros.h        | 10 ++++++----
 5 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 1a1135bab4..59513ede33 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/var_desc.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
@@ -34,9 +35,6 @@ class BlockDescBind {
   BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
       : prog_(prog), desc_(desc), need_update_(false) {}
 
-  BlockDescBind(const BlockDescBind &o) = delete;
-  BlockDescBind &operator=(const BlockDescBind &o) = delete;
-
   int32_t ID() const { return desc_->idx(); }
 
   int32_t Parent() const { return desc_->parent_idx(); }
@@ -66,6 +64,8 @@ class BlockDescBind {
 
   std::deque<std::unique_ptr<OpDescBind>> ops_;
   std::unordered_map<std::string, std::unique_ptr<VarDescBind>> vars_;
+
+  DISABLE_COPY_AND_ASSIGN(BlockDescBind);
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h
index 6d1ee4dece..5df3093318 100644
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -19,6 +19,7 @@
 #include <unordered_map>
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/op_desc.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
@@ -72,11 +73,6 @@ class OpInfoMap {
  public:
   static OpInfoMap& Instance();
 
-  OpInfoMap(const OpInfoMap& o) = delete;
-  OpInfoMap(OpInfoMap&& o) = delete;
-  OpInfoMap& operator=(const OpInfoMap& o) = delete;
-  OpInfoMap& operator=(OpInfoMap&& o) = delete;
-
   bool Has(const std::string& op_type) const {
     return map_.find(op_type) != map_.end();
   }
@@ -112,6 +108,8 @@ class OpInfoMap {
  private:
   OpInfoMap() = default;
   std::unordered_map<std::string, const OpInfo> map_;
+
+  DISABLE_COPY_AND_ASSIGN(OpInfoMap);
 };
 
 }  // namespace framework
diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h
index 06ffcd4b15..9b34a06aef 100644
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <vector>
 #include "paddle/framework/framework.pb.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
@@ -26,9 +27,6 @@ class ProgramDescBind {
  public:
   static ProgramDescBind &Instance(ProgramDesc *prog);
 
-  ProgramDescBind(const ProgramDescBind &o) = delete;
-  ProgramDescBind &operator=(const ProgramDescBind &o) = delete;
-
   BlockDescBind *AppendBlock(const BlockDescBind &parent);
 
   BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); }
@@ -46,6 +44,8 @@ class ProgramDescBind {
   ProgramDesc *prog_;
 
   std::vector<std::unique_ptr<BlockDescBind>> blocks_;
+
+  DISABLE_COPY_AND_ASSIGN(ProgramDescBind);
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index c93b03e481..7047f0d55e 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_map>
 
 #include "paddle/framework/variable.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
@@ -38,11 +39,6 @@ class Scope {
   Scope() {}
   ~Scope();
 
-  // Disable Copy, Assign, Move.
-  Scope(const Scope& other) = delete;
-  Scope& operator=(const Scope& other) = delete;
-  Scope(Scope&& other) = delete;
-
   /// Create a sub-scope. Returns a reference other than a pointer so
   /// to prevent from manual deletion.
   /// Mark it to const because that new kid scope cannot change parent scope.
@@ -73,6 +69,8 @@ class Scope {
   std::unordered_map<std::string, Variable*> vars_;
   mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
+
+  DISABLE_COPY_AND_ASSIGN(Scope);
 };
 
 }  // namespace framework
diff --git a/paddle/platform/macros.h b/paddle/platform/macros.h
index 4a04a38c0c..feae7bdd77 100644
--- a/paddle/platform/macros.h
+++ b/paddle/platform/macros.h
@@ -16,8 +16,10 @@ limitations under the License. */
 
 // Disable the copy and assignment operator for a class.
 #ifndef DISABLE_COPY_AND_ASSIGN
-#define DISABLE_COPY_AND_ASSIGN(classname) \
- private:                                  \
-  classname(const classname&) = delete;    \
-  classname& operator=(const classname&) = delete
+#define DISABLE_COPY_AND_ASSIGN(classname)         \
+ private:                                          \
+  classname(const classname&) = delete;            \
+  classname(const classname&&) = delete;           \
+  classname& operator=(const classname&) = delete; \
+  classname& operator=(const classname&&) = delete
 #endif

From 2296d81cf9560437b368354229b7ceb22b67d234 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 2 Oct 2017 11:39:10 -0700
Subject: [PATCH 04/15] Use `type_defs.h` to resolve cyclic dependencies

---
 paddle/framework/attribute.h | 10 +---------
 paddle/framework/op_desc.h   |  2 +-
 paddle/framework/op_info.h   |  7 +------
 paddle/framework/type_defs.h | 38 ++++++++++++++++++++++++++++++++++++
 4 files changed, 41 insertions(+), 16 deletions(-)
 create mode 100644 paddle/framework/type_defs.h

diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h
index c7559cefb6..d13530e340 100644
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -21,20 +21,12 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/type_defs.h"
 #include "paddle/platform/enforce.h"
-#include "paddle/platform/variant.h"
 
 namespace paddle {
 namespace framework {
 
-// The order should be as same as framework.proto
-typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
-                       std::vector<float>, std::vector<std::string>, bool,
-                       std::vector<bool>, BlockDesc*>
-    Attribute;
-
-typedef std::unordered_map<std::string, Attribute> AttributeMap;
-
 ProgramDesc& GetProgramDesc();
 
 template <typename T>
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index e03b4d067f..0af4169715 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 #include "paddle/framework/attribute.h"
-#include "paddle/framework/op_info.h"
+#include "paddle/framework/type_defs.h"
 #include "paddle/framework/var_desc.h"
 
 namespace paddle {
diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h
index 6d1ee4dece..470336d367 100644
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -19,15 +19,10 @@
 #include <unordered_map>
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/op_desc.h"
+#include "paddle/framework/type_defs.h"
 
 namespace paddle {
 namespace framework {
-class OperatorBase;
-using VariableNameMap = std::map<std::string, std::vector<std::string>>;
-
-using OpCreator = std::function<OperatorBase*(
-    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
-    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
 
 class GradOpDescMakerBase {
  public:
diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h
new file mode 100644
index 0000000000..dec5066f1e
--- /dev/null
+++ b/paddle/framework/type_defs.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <functional>
+#include <map>
+#include "paddle/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+class OperatorBase;
+using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+
+// The order should be as same as framework.proto
+using Attribute =
+    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
+                   std::vector<float>, std::vector<std::string>, bool,
+                   std::vector<bool>, BlockDesc*>;
+
+using AttributeMap = std::unordered_map<std::string, Attribute>;
+
+using OpCreator = std::function<OperatorBase*(
+    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
+    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
+
+}  // namespace framework
+}  // namespace paddle

From 32f5c9dd934e7de15a93a8145bf6ee4499b3bc7d Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Oct 2017 11:51:24 -0700
Subject: [PATCH 05/15] recurrent_op pass the unit test

---
 paddle/operators/recurrent_op.cc              | 87 +++++--------------
 paddle/operators/recurrent_op.h               | 23 +----
 paddle/operators/rnn/recurrent_op_utils.cc    | 55 +++++-------
 paddle/operators/rnn/recurrent_op_utils.h     |  6 +-
 paddle/operators/sum_op.cc                    |  5 +-
 .../v2/framework/tests/test_recurrent_op.py   | 26 +++---
 6 files changed, 66 insertions(+), 136 deletions(-)

diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index b9fba3e135..016e2043fd 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -28,7 +28,8 @@ using Variable = framework::Variable;
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-void RecurrentAlgorithm::InferShape(const Scope& scope) const {
+void RecurrentAlgorithm::Run(const Scope& scope,
+                             const platform::DeviceContext& dev_ctx) const {
   auto* input0 = scope.FindVar(arg_->inlinks[0]);
   PADDLE_ENFORCE_NOT_NULL(input0);
   seq_len_ = input0->GetMutable<LoDTensor>()->dims()[0];
@@ -36,38 +37,16 @@ void RecurrentAlgorithm::InferShape(const Scope& scope) const {
 
   CreateScopes(scope);
   auto& step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
-                     true /*infer_shape_mode*/);
-  InitMemories(step_scopes[0], true /*infer_shape_mode*/);
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_);
+  InitMemories(step_scopes[0]);
 
   for (size_t i = 0; i < seq_len_; i++) {
     if (i > 0) {
-      rnn::LinkMemories(step_scopes, arg_->memories, i, -1,
-                        true /*infer_shape_mode*/);
-    }
-    (*stepnet_)->InferShape(*step_scopes[i]);
-  }
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
-                     true /*infer_shape_mode*/);
-}
-
-void RecurrentAlgorithm::Run(const Scope& scope,
-                             const platform::DeviceContext& dev_ctx) const {
-  auto step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
-                     false /*infer_shape_mode*/);
-  InitMemories(step_scopes[0], false /*infer_shape_mode*/);
-
-  for (size_t step_id = 0; step_id < seq_len_; step_id++) {
-    // create output alias variables
-    if (step_id > 0) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1,
-                        false /*infer_shape_mode*/);
+      rnn::LinkMemories(step_scopes, arg_->memories, i, -1);
     }
-    (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
+    (*stepnet_)->Run(*step_scopes[i], dev_ctx);
   }
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
-                     false /*infer_shape_mode*/);
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_);
 }
 
 void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
@@ -105,8 +84,7 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
   }
 }
 
-void RecurrentAlgorithm::InitMemories(Scope* step_scope,
-                                      bool infer_shape_mode) const {
+void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
   for (auto& attr : arg_->memories) {
     auto* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<LoDTensor>();
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
@@ -114,12 +92,9 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope,
                    attr.boot_var);
     auto* boot_mem =
         step_scope->FindVar(attr.boot_var)->GetMutable<LoDTensor>();
-    if (infer_shape_mode) {
-      pre_mem->Resize(boot_mem->dims());
-      PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
-    } else {
-      pre_mem->ShareDataWith<float>(*boot_mem);
-    }
+    pre_mem->Resize(boot_mem->dims());
+    PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
+    pre_mem->ShareDataWith<float>(*boot_mem);
   }
 }
 
@@ -169,23 +144,22 @@ class RecurrentAlgorithmProtoAndCheckerMaker
 
 void RecurrentGradientAlgorithm::Run(
     const Scope& scope, const platform::DeviceContext& dev_ctx) const {
+  seq_len_ =
+      scope.FindVar(arg_->inlinks[0])->GetMutable<LoDTensor>()->dims()[0];
   auto step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
-                     false /*infer_shape_mode*/);
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_);
   for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
     if (static_cast<size_t>(step_id) != seq_len_ - 1) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
-                        false /*infer_shape_mode*/);
+      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
-  LinkBootMemoryGradients(step_scopes[0], false);
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
-                     false /*infer_shape_mode*/);
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_);
+  LinkBootMemoryGradients(step_scopes[0]);
 }
 
 void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
-    Scope* step_scope, bool infer_shape_mode) const {
+    Scope* step_scope) const {
   for (auto& attr : arg_->memories) {
     PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr,
                    "memory variable [%s] does not exists", attr.var);
@@ -194,30 +168,9 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
     auto* mem_grad = step_scope->NewVar(attr.var)->GetMutable<LoDTensor>();
     auto* boot_mem_grad =
         step_scope->NewVar(attr.boot_var)->GetMutable<LoDTensor>();
-    if (infer_shape_mode) {
-      boot_mem_grad->Resize(mem_grad->dims());
-    } else {
-      boot_mem_grad->ShareDataWith<float>(*mem_grad);
-    }
-  }
-}
-
-void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
-  seq_len_ =
-      scope.FindVar(arg_->inlinks[0])->GetMutable<LoDTensor>()->dims()[0];
-  auto step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
-                     true /*infer_shape_mode*/);
-  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
-    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
-                        true /*infer_shape_mode*/);
-    }
-    (*stepnet_)->InferShape(*step_scopes[step_id]);
+    boot_mem_grad->Resize(mem_grad->dims());
+    boot_mem_grad->ShareDataWith<float>(*mem_grad);
   }
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
-                     true /*infer_shape_mode*/);
-  LinkBootMemoryGradients(step_scopes[0], true /*infer_shape_mode*/);
 }
 
 RecurrentGradientOp::RecurrentGradientOp(
diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h
index 18f8c53e18..752025e42c 100644
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -41,11 +41,6 @@ class RecurrentAlgorithm {
     stepnet_ = stepnet;
   }
 
-  /**
-   * InferShape must be called before Run.
-   */
-  void InferShape(const framework::Scope& scope) const;
-
  protected:
   /*
    * The step scopes will be stored in the father scope as a variable.
@@ -61,7 +56,7 @@ class RecurrentAlgorithm {
                 ->GetMutable<std::vector<framework::Scope*>>();
   }
 
-  void InitMemories(framework::Scope* step_scopes, bool infer_shape_mode) const;
+  void InitMemories(framework::Scope* step_scopes) const;
 
  private:
   std::unique_ptr<framework::OperatorBase>* stepnet_;
@@ -91,13 +86,7 @@ class RecurrentGradientAlgorithm {
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const;
 
-  void LinkBootMemoryGradients(framework::Scope* step_scopes,
-                               bool infer_shape_mode) const;
-
-  /**
- * InferShape must be called before Run.
- */
-  void InferShape(const framework::Scope& scope) const;
+  void LinkBootMemoryGradients(framework::Scope* step_scopes) const;
 
  protected:
   inline const std::vector<framework::Scope*>& GetStepScopes(
@@ -136,10 +125,6 @@ class RecurrentOp : public framework::OperatorBase {
 
   const OperatorBase& stepnet() const { return *stepnet_; }
 
-  void InferShape(const framework::Scope& scope) const {
-    alg_.InferShape(scope);
-  }
-
   static const rnn::ArgumentName kArgName;
 
  private:
@@ -162,10 +147,6 @@ class RecurrentGradientOp : public framework::OperatorBase {
     PADDLE_THROW("Not Implemented");
   }
 
-  void InferShape(const framework::Scope& scope) const {
-    alg_.InferShape(scope);
-  }
-
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {
     alg_.Run(scope, dev_ctx);
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
index a767009d23..a02994f99d 100644
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -25,7 +25,7 @@ using LoDTensor = framework::LoDTensor;
 
 void SegmentInputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& inlinks,
-                   const size_t seq_len, bool infer_shape_mode) {
+                   const size_t seq_len) {
   PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
   for (size_t i = 0; i < inlinks.size(); ++i) {
     // global inputs
@@ -41,11 +41,9 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
     for (size_t j = 0; j < seq_len; j++) {
       Tensor* step_input =
           step_scopes[j]->NewVar(inlinks[i])->GetMutable<Tensor>();
-      if (!infer_shape_mode) {
-        // The input of operators of each step is Tensor here.
-        // Maybe need to modify Slice function.
-        *step_input = input->Slice<float>(j, j + 1);
-      }
+      // The input of operators of each step is Tensor here.
+      // Maybe need to modify Slice function.
+      *step_input = input->Slice<float>(j, j + 1);
       step_input->Resize(step_dims);
     }
   }
@@ -53,39 +51,35 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
 
 void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& outlinks,
-                   const size_t seq_len, bool infer_shape_mode) {
+                   const size_t seq_len) {
   for (size_t i = 0; i < outlinks.size(); i++) {
     auto output_var = step_scopes[0]->parent().FindVar(outlinks[i]);
     PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.",
                             outlinks[i]);
     LoDTensor* output = output_var->GetMutable<LoDTensor>();
 
-    if (infer_shape_mode) {
-      auto step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
-      PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]);
-      f::DDim step_dims =
-          step_scope_var->template GetMutable<LoDTensor>()->dims();
-      std::vector<int64_t> dims_vec = vectorize(step_dims);
-      dims_vec.insert(dims_vec.begin(), seq_len);
-      output->Resize(f::make_ddim(dims_vec));
-    } else {
-      output->mutable_data<float>(platform::CPUPlace());
-      for (size_t j = 0; j < seq_len; j++) {
-        LoDTensor* step_output =
-            step_scopes[j]->FindVar(outlinks[i])->GetMutable<LoDTensor>();
-        // TODO(luotao02) data type and platform::DeviceContext() should set
-        // correctly
-        (output->Slice<float>(j, j + 1))
-            .CopyFrom<float>(*step_output, platform::CPUPlace());
-      }
+    auto step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
+    PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]);
+    f::DDim step_dims =
+        step_scope_var->template GetMutable<LoDTensor>()->dims();
+    std::vector<int64_t> dims_vec = vectorize(step_dims);
+    dims_vec.insert(dims_vec.begin(), seq_len);
+    output->Resize(f::make_ddim(dims_vec));
+    output->mutable_data<float>(platform::CPUPlace());
+    for (size_t j = 0; j < seq_len; j++) {
+      LoDTensor* step_output =
+          step_scopes[j]->FindVar(outlinks[i])->GetMutable<LoDTensor>();
+      // TODO(luotao02) data type and platform::DeviceContext() should set
+      // correctly
+      (output->Slice<float>(j, j + 1))
+          .CopyFrom<float>(*step_output, platform::CPUPlace());
     }
   }
 }
 
 void LinkMemories(const std::vector<Scope*>& scopes,
                   const std::vector<rnn::MemoryAttr>& memories,
-                  const size_t step_id, const int offset,
-                  bool infer_shape_mode) {
+                  const size_t step_id, const int offset) {
   PADDLE_ENFORCE_LT(step_id, scopes.size(),
                     "step [%d] is out of range of step scopes' size [%d]",
                     step_id, scopes.size());
@@ -100,11 +94,8 @@ void LinkMemories(const std::vector<Scope*>& scopes,
   for (auto& attr : memories) {
     auto mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
     auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
-    if (infer_shape_mode) {
-      mem->Resize(linked_mem->dims());
-    } else {
-      mem->ShareDataWith<float>(*linked_mem);
-    }
+    mem->Resize(linked_mem->dims());
+    mem->ShareDataWith<float>(*linked_mem);
   }
 }
 
diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h
index 9c777f1e90..fd17b9b889 100644
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
@@ -64,18 +64,18 @@ struct ArgumentName {
  */
 void SegmentInputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& inlinks,
-                   const size_t seq_len, bool infer_shape_mode);
+                   const size_t seq_len);
 
 /**
  * Process outputs of step nets and merge to variables.
  */
 void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& outlinks,
-                   const size_t seq_len, bool infer_shape_mode);
+                   const size_t seq_len);
 
 void LinkMemories(const std::vector<Scope*>& step_scopes,
                   const std::vector<MemoryAttr>& memories, const size_t step_id,
-                  const int offset, bool infer_shape_mode);
+                  const int offset);
 
 void InitArgument(const ArgumentName& name, Argument* arg,
                   const framework::OperatorBase& op, bool is_grad = false);
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index 5d76313aeb..c54843faa6 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -22,14 +22,15 @@ class SumOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
     auto x_dims = ctx->GetInputsDim("X");
-    PADDLE_ENFORCE(!x_dims.empty(), "Input(X) of SumOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SumOp should not be null.");
 
-    auto in_dim = x_dims[0];
     size_t N = x_dims.size();
     PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
+
+    auto in_dim = x_dims[0];
     for (size_t i = 1; i < N; i++) {
       auto dim = x_dims[i];
       PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape");
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
index 6b9e7a88ce..1f114432c0 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -16,14 +16,17 @@ class PySimpleRNN(object):
     '''
 
     def __init__(self, input_dim=30, batch_size=50, weight_dim=15, sent_len=11):
-        self.x = np.random.normal(size=(sent_len, batch_size, input_dim))
-        self.W = np.random.normal(size=(input_dim, input_dim))
-        self.U = np.random.normal(size=(input_dim, input_dim))
-        self.h_boot = np.random.normal(size=(batch_size, input_dim))
+        self.x = np.random.normal(size=(sent_len, batch_size,
+                                        input_dim)).astype("float32")
+        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.h_boot = np.random.normal(size=(batch_size,
+                                             input_dim)).astype("float32")
 
         # memories
         self.mems = [
-            np.zeros(shape=(batch_size, input_dim)) for i in range(sent_len)
+            np.zeros(shape=(batch_size, input_dim)).astype("float32")
+            for i in range(sent_len)
         ]
 
     def forward(self):
@@ -36,7 +39,7 @@ class PySimpleRNN(object):
         return [self.x[i] for i in range(self.x.shape[0])]
 
     def concat_outputs(self):
-        return np.array(self.mems)
+        return np.array(self.mems).astype("float32")
 
     def step(self, step_id, x):
         '''
@@ -47,8 +50,8 @@ class PySimpleRNN(object):
             pre_mem = self.mems[step_id - 1]
         else:
             pre_mem = self.h_boot
-        xW = np.matmul(x, self.W)
-        hU = np.matmul(pre_mem, self.U)
+        xW = np.matmul(x, self.W).astype("float32")
+        hU = np.matmul(pre_mem, self.U).astype("float32")
 
         sum = xW + hU
         self.mems[step_id] = py_sigmoid(sum)
@@ -102,7 +105,8 @@ class RecurrentOpTest(unittest.TestCase):
         self.create_step_net()
         ctx = core.DeviceContext.create(core.CPUPlace())
         self.rnnop.run(self.scope, ctx)
-        return np.array(self.scope.find_var("h@mem").get_tensor())
+        return np.array(self.scope.find_var("h@mem").get_tensor()).astype(
+            "float32")
 
     def create_global_variables(self):
         # create inlink
@@ -142,7 +146,7 @@ class RecurrentOpTest(unittest.TestCase):
         stepnet = core.Net.create()
         x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
         sig_op = Operator("sigmoid", X="sum", Y="h@mem")
 
         for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
@@ -179,7 +183,7 @@ class RecurrentGradientOpTest(unittest.TestCase):
         stepnet = core.Net.create()
         x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
         sig_op = Operator("sigmoid", X="sum", Y="h@alias")
 
         for op in [x_fc_op, h_fc_op, sum_op, sig_op]:

From c705f065ba403606d39bc972d85f7eba1920f029 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Mon, 2 Oct 2017 16:14:48 -0400
Subject: [PATCH 06/15] add TensorArray (#4459)

* add tensor array

* update

* set type
---
 paddle/framework/CMakeLists.txt       |   3 +
 paddle/framework/tensor_array.cc      | 283 ++++++++++++++++++++++++++
 paddle/framework/tensor_array.h       | 118 +++++++++++
 paddle/framework/tensor_array_test.cc | 130 ++++++++++++
 4 files changed, 534 insertions(+)
 create mode 100644 paddle/framework/tensor_array.cc
 create mode 100644 paddle/framework/tensor_array.h
 create mode 100644 paddle/framework/tensor_array_test.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 9140854a96..5d394132b7 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -43,3 +43,6 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
+
+cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
+cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc
new file mode 100644
index 0000000000..d54714c66c
--- /dev/null
+++ b/paddle/framework/tensor_array.cc
@@ -0,0 +1,283 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+
+
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/tensor_array.h"
+
+#include <glog/logging.h>
+#include <algorithm>
+#include <limits>
+
+namespace paddle {
+namespace framework {
+
+namespace detail {
+
+/*
+ * Offer an iterator over the length-sorted lod-tensor's top level. The top
+ * level of a lod-tensor stores a batch of sequences, and each top-level
+ * sequence may contain several lower-level sequences. The top-level lod is
+ * sorted by the number of lower-level sequences in descending order, so that
+ * while the RNN runs the batch size keeps decreasing and the short sentences
+ * end at the tail of each batch.
+ *
+ * Let's take a simple lod-tensor for example
+ *
+ *   |(0)       |(1)        top-level has two instances
+ *   |||        |||||    lower-level
+ *
+ * sort by lower-level's length
+ *
+ *   |(1)       |(0)
+ *   |||||      |||
+ *
+ * when the RNN runs, it gets 5 batches (equal to the number of elements the
+ * longest sequence has)
+ *
+ * |||||
+ * |||
+ *
+ * the first three batches have two elements each, the last two batches have
+ * just 1 element each.
+ */
+struct DynamicBatchUnpacker {
+  using value_type = float;
+
+  DynamicBatchUnpacker(const LoDTensor& source, size_t level,
+                       bool descend = true)
+      : source(&source), level(level) {
+    BuildLengthSortedMeta(descend);
+  }
+
+  LoDTensor GetBatch(size_t index);
+
+  std::vector<DySeqMeta> meta;
+
+  LoDTensor const* source;
+  size_t level;
+
+ protected:
+  void BuildLengthSortedMeta(bool descend);
+};
+
+LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
+                           const std::vector<DySeqMeta>& meta, const LoD& lod,
+                           size_t level);
+
+}  // namespace detail
+
+// Read the tensor at `index`; the array grows to `index + 1` slots if needed.
+const LoDTensor& TensorArray::Read(size_t index) const {
+  // strictly less: slot MAX_SIZE would make the array hold MAX_SIZE + 1 values
+  PADDLE_ENFORCE_LT(index, MAX_SIZE, "index[%d] too large", index);
+  if (index >= size()) values_.resize(index + 1);
+  return values_[index];
+}
+
+// Copy `value` into slot `index`; the array grows to `index + 1` if needed.
+void TensorArray::Write(size_t index, const LoDTensor& value) {
+  // strictly less: slot MAX_SIZE would make the array hold MAX_SIZE + 1 values
+  PADDLE_ENFORCE_LT(index, MAX_SIZE, "index[%d] too large", index);
+  if (index >= size()) values_.resize(index + 1);
+  // shape the slot like `value`, then copy the data over on CPUPlace
+  auto& slot = values_[index];
+  slot.Resize(value.dims());
+  slot.mutable_data<value_type>(platform::CPUPlace());
+  slot.CopyFrom<value_type>(value, platform::CPUPlace());
+}
+
+// Make slot `index` share data with `value`; the array grows if needed.
+void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
+  // strictly less: slot MAX_SIZE would make the array hold MAX_SIZE + 1 values
+  PADDLE_ENFORCE_LT(index, MAX_SIZE, "index[%d] too large", index);
+  if (index >= size()) values_.resize(index + 1);
+
+  values_[index].ShareDataWith<value_type>(value);
+}
+
+LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
+                            const LoD& lod) const {
+  return detail::PackDynamicBatch(values_, meta, lod, level);
+}
+
+std::vector<DySeqMeta> TensorArray::Unpack(const LoDTensor& source, int level,
+                                           bool length_desend) {
+  detail::DynamicBatchUnpacker unpacker(source, level,
+                                        length_desend /*descend*/);
+
+  // find max length of all the sequences
+  size_t max_length = 0;
+  for (const auto& seq : unpacker.meta) {
+    max_length = std::max(max_length, seq.end - seq.begin);
+  }
+
+  // write batches to values
+  for (size_t batch_id = 0; batch_id < max_length; batch_id++) {
+    Write(batch_id, unpacker.GetBatch(batch_id));
+  }
+
+  return unpacker.meta;
+}
+
+LoDTensor TensorArray::Stack() const {
+  LoDTensor result;
+  if (size() == 0) return result;
+
+  const auto& first_dims = values_.front().dims();
+  // check all the values have the same shape
+  // TODO(superjom) check the same dtypes
+  for (size_t idx = 1; idx < size(); idx++) {
+    const auto& value_dims = values_[idx].dims();
+    PADDLE_ENFORCE_EQ(first_dims, value_dims);
+  }
+
+  // copy
+  auto result_dims = vectorize(first_dims);
+  result_dims.insert(result_dims.begin(), size());
+  result.Resize(make_ddim(result_dims));
+  result.mutable_data<value_type>(platform::CPUPlace());
+
+  for (size_t idx = 0; idx < size(); idx++) {
+    result.Slice<value_type>(idx, idx + 1)
+        .CopyFrom<value_type>(Read(idx), platform::CPUPlace());
+  }
+  return result;
+}
+
+void TensorArray::Unstack(const LoDTensor& source) const {
+  Unstack(source, false /*data_shared*/);
+}
+
+void TensorArray::UnstackShared(const LoDTensor& source) const {
+  Unstack(source, true /*data_shared*/);
+}
+
+void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
+  size_t first_dim = source.dims()[0];
+  DDim value_dims = slice_ddim(source.dims(), 1, source.dims().size());
+  PADDLE_ENFORCE_GT(first_dim, 0,
+                    "source should have some data to be unstacked");
+
+  values_.resize(first_dim);
+
+  for (size_t elem = 0; elem < first_dim; elem++) {
+    // create a new value
+    auto& value = values_[elem];
+    if (data_shared) {
+      // share memory
+      value.ShareDataWith<value_type>(source.Slice<value_type>(elem, elem + 1));
+    } else {
+      // copy
+      value.Resize(value_dims);
+      value.CopyFrom<value_type>(source.Slice<value_type>(elem, elem + 1),
+                                 platform::CPUPlace());
+    }
+  }
+}
+
+size_t TensorArray::size() const { return values_.size(); }
+
+namespace detail {
+
+// Build one DySeqMeta per sequence of `level` and sort them by length.
+void DynamicBatchUnpacker::BuildLengthSortedMeta(bool descend) {
+  PADDLE_ENFORCE(meta.empty(), "duplicate build meta");
+  // collect meta for each sequence in some level
+  auto lod = SliceLevels(source->lod(), level, level + 1)[0];
+  // `seq_id + 1 < size` avoids unsigned wrap-around when the level is empty
+  for (size_t seq_id = 0; seq_id + 1 < lod.size(); seq_id++) {
+    DySeqMeta seq_meta({lod[seq_id], lod[seq_id + 1], seq_id});
+    meta.push_back(seq_meta);
+  }
+  PADDLE_ENFORCE_GT(meta.size(), 0, "meta is empty");
+  // sort by length; ties must compare false both ways (strict weak ordering)
+  sort(meta.begin(), meta.end(),
+       [descend](const DySeqMeta& a, const DySeqMeta& b) {
+         const size_t len_a = a.end - a.begin;
+         const size_t len_b = b.end - b.begin;
+         return descend ? len_a > len_b : len_a < len_b;
+       });
+}
+
+// Gather the `index`-th record of every sequence that has one into a single
+// batch tensor whose first dimension is the number of gathered records.
+LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
+  PADDLE_ENFORCE(!meta.empty(), "should build meta first");
+  LoDTensor result;
+  // collect the source rows to copy. `index` is relative to each sequence's
+  // begin, so it is compared with the sequence length rather than with `end`;
+  // the early `break` is only right when `meta` is sorted by descending length.
+  std::vector<size_t> indice;
+  for (const auto& seq_meta : meta) {
+    if (index >= seq_meta.end - seq_meta.begin) break;
+    indice.push_back(seq_meta.begin + index);
+  }
+  PADDLE_ENFORCE(!indice.empty(), "invalid batch at %d", index);
+
+  // result has shape [indice.size(), <record dims of source>]
+  auto record_dims = slice_ddim(source->dims(), 1, source->dims().size());
+  auto record_dims_vec = vectorize(record_dims);
+  record_dims_vec.insert(record_dims_vec.begin(), indice.size());
+  result.Resize(make_ddim(record_dims_vec));
+  result.mutable_data<value_type>(platform::CPUPlace());
+  // copy every collected row, including the last one
+  for (size_t i = 0; i < indice.size(); i++) {
+    auto row = indice[i];
+    auto target = result.Slice<value_type>(i, i + 1);
+    auto source_ = source->Slice<value_type>(row, row + 1);
+    target.CopyFrom<value_type>(source_, platform::CPUPlace());
+  }
+  return result;
+}
+
+// Scatter the rows of the per-batch tensors in `source` into one tensor at the
+// positions given by `meta`, then attach `lod` to the result.
+LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
+                           const std::vector<DySeqMeta>& meta, const LoD& lod,
+                           size_t level) {
+  PADDLE_ENFORCE(!source.empty());
+  PADDLE_ENFORCE(!meta.empty());
+  // `lod[level].back()` is read below, so validate `level` and that level
+  PADDLE_ENFORCE_LT(level, lod.size(), "level [%d] is out of range", level);
+  PADDLE_ENFORCE(!lod[level].empty(), "lod level [%d] is empty", level);
+  LoDTensor result;
+
+  // init result space: [last offset of the level, <record dims>]
+  auto record_dims = slice_ddim(source[0].dims(), 1, source[0].dims().size());
+  auto record_dims_vec = vectorize(record_dims);
+  auto height = lod[level].back();
+  record_dims_vec.insert(record_dims_vec.begin(), height);
+  result.Resize(make_ddim(record_dims_vec));
+  result.mutable_data<float>(platform::CPUPlace());
+
+  for (size_t batch_id = 0; batch_id < source.size(); batch_id++) {
+    for (size_t seq_id = 0; seq_id < meta.size(); seq_id++) {
+      const auto& seq_meta = meta[seq_id];
+      // source is source[batch_id][seq_id]
+      // target is result[index]
+      auto index = seq_meta.begin + batch_id;
+      if (index >= seq_meta.end) break;
+      auto source_ = source[batch_id].Slice<float>(seq_id, seq_id + 1);
+      auto target = result.Slice<float>(index, index + 1);
+      target.CopyFrom<float>(source_, platform::CPUPlace());
+    }
+  }
+  result.set_lod(lod);
+  return result;
+}
+
+}  // namespace detail
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_array.h b/paddle/framework/tensor_array.h
new file mode 100644
index 0000000000..e76f33d2c0
--- /dev/null
+++ b/paddle/framework/tensor_array.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <vector>
+
+#include "paddle/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+
+/*
+ * DySeqMeta stores indices of the basic elements in the tensor. It is used
+ * after a lod-tensor is re-assembled; its info can be used to recover the
+ * order in the original lod-tensor.
+ */
+struct DySeqMeta {
+  size_t begin;
+  size_t end;  // not included
+  size_t ori_idx;
+};
+
+/*
+ * TensorArray is a C-array-like array of tensors, it is meant to be used with
+ * dynamic iteration primitives such as while_loop. It is used to segment inputs
+ * and store states in all time steps.
+ *
+ * By providing some methods similar to a C++ array, the definition of some
+ * state-based dynamic models such as RNN could be more natural and highly
+ * flexible.
+ */
+class TensorArray {
+ public:
+  using value_type = float;
+
+  // max number of values allowed to store.
+  const size_t MAX_SIZE{100000};
+
+  /*
+   * Inputs:
+   *   - values_shared: share memory between tensors.
+   */
+  explicit TensorArray(bool values_shared = true)
+      : values_shared_(values_shared) {}
+
+  /*
+   * Read the value at location `index` in the `TensorArray`.
+   */
+  const LoDTensor &Read(size_t index) const;
+
+  /*
+   * Write value into the index of the TensorArray.
+   */
+  void Write(size_t index, const LoDTensor &value);
+
+  /*
+   * Write value into the index of the TensorArray, with memory shared.
+   */
+  void WriteShared(size_t index, const LoDTensor &value);
+
+  /*
+   * Recover the original LoD-arranged LoDTensor from the `values`, `level`,
+   * `meta` and `lod`.
+   */
+  LoDTensor Pack(size_t level, const std::vector<DySeqMeta> &meta,
+                 const LoD &lod) const;
+
+  /*
+   * Split LoDTensor in some `level` and write the generated batches to
+   * `values`. If `length_desend` is set, sequences are sorted by length in
+   * descending order, otherwise in ascending order.
+   */
+  std::vector<DySeqMeta> Unpack(const LoDTensor &source, int level,
+                                bool length_desend);
+
+  /*
+   * Pack the values into a tensor with rank one higher than each tensor in
+   * values.
+   */
+  LoDTensor Stack() const;
+
+  /*
+   * Unpacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors.
+   */
+  void Unstack(const LoDTensor &source) const;
+
+  /*
+   * Unpacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors,
+   * with memory of tensors shared.
+   */
+  void UnstackShared(const LoDTensor &source) const;
+
+  /*
+   * Return the number of values.
+   */
+  size_t size() const;
+
+ protected:
+  void Unstack(const LoDTensor &source, bool data_shared) const;
+
+ private:
+  mutable std::vector<LoDTensor> values_;
+  bool values_shared_;
+};  // class TensorArray
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_array_test.cc b/paddle/framework/tensor_array_test.cc
new file mode 100644
index 0000000000..d9f52509cd
--- /dev/null
+++ b/paddle/framework/tensor_array_test.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/tensor_array.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+
+// Fixture: SetUp unstacks a [batch_size, dim] tensor of 0, 1, 2, ... into `ta`.
+class TensorArrayTester : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    LoDTensor source;
+    source.Resize(make_ddim({batch_size, dim}));
+    int* data = source.mutable_data<int>(platform::CPUPlace());
+    // derive the element count from the shape instead of repeating 16 * 32
+    for (int i = 0; i < batch_size * dim; i++) data[i] = i;
+    ta.Unstack(source);
+  }
+
+  TensorArray ta;
+  const int batch_size = 16;
+  const int dim = 32;
+};
+
+TEST_F(TensorArrayTester, Read) {
+  for (int i = 0; i < batch_size; i++) {
+    const auto& tensor = ta.Read(i);
+    ASSERT_EQ(tensor.dims()[0], 1);
+    ASSERT_EQ(tensor.dims()[1], dim);
+  }
+}
+
+TEST_F(TensorArrayTester, Write) {
+  LoDTensor source;
+  source.Resize(make_ddim({1, dim}));
+  for (int i = 0; i < dim; i++) {
+    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
+  }
+
+  ta.Write(2, source);
+
+  const auto& tensor = ta.Read(2);
+  for (int i = 0; i < dim; i++) {
+    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
+  }
+}
+
+TEST_F(TensorArrayTester, WriteShared) {
+  LoDTensor source;
+  source.Resize(make_ddim({1, dim}));
+  for (int i = 0; i < dim; i++) {
+    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
+  }
+
+  ta.WriteShared(2, source);
+
+  const auto& tensor = ta.Read(2);
+  for (int i = 0; i < dim; i++) {
+    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
+  }
+
+  EXPECT_EQ(source.data<int>(), tensor.data<int>());
+}
+
+class TensorArrayPackTester : public ::testing::Test {
+ protected:
+  virtual void SetUp() override {
+    lod.push_back(std::vector<size_t>{0, 2, 9, 13});
+
+    source.set_lod(lod);
+    source.Resize(make_ddim({13, 128}));
+    source.mutable_data<int>(platform::CPUPlace());
+
+    // content of each sentence: 0 1 2 3 4
+    const auto& level = lod.front();
+    for (size_t i = 0; i < level.size() - 1; i++) {
+      size_t begin = level[i];
+      size_t end = level[i + 1];
+      for (size_t j = begin; j < end; j++) {
+        auto record = source.Slice<int>(j, j + 1);
+        for (int dim = 0; dim < 128; dim++) {
+          record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
+        }
+      }
+    }
+
+    // unpack
+    meta = ta.Unpack(source, 0, true);
+  }
+
+  LoD lod;
+  TensorArray ta;
+  LoDTensor source;
+  std::vector<DySeqMeta> meta;
+};
+
+TEST_F(TensorArrayPackTester, Unpack) {
+  ASSERT_EQ(ta.size(), 7UL);
+
+  const auto& t0 = ta.Read(0);
+  const auto& t1 = ta.Read(1);
+
+  ASSERT_EQ(t0.data<int>()[0], int(0));
+  ASSERT_EQ(t1.data<int>()[0], int(1));
+}
+
+TEST_F(TensorArrayPackTester, Pack) {
+  LoDTensor packed = ta.Pack(0, meta, lod);
+}
+
+TEST_F(TensorArrayTester, size) {
+  ASSERT_EQ(ta.size(), static_cast<size_t>(batch_size));
+}
+
+}  // namespace framework
+}  // namespace paddle

From 37bbaabdf1abfe14e19cf7dcb7a842a10b36d1c8 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Mon, 2 Oct 2017 14:17:17 -0700
Subject: [PATCH 07/15] "fix conflict"

---
 paddle/framework/op_info.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h
index 7940922b09..9672e540c8 100644
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -20,7 +20,7 @@
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/type_defs.h"
-
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {

From 6b051b651ae72305d9877fd3cd094028c21bdddb Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Oct 2017 14:24:03 -0700
Subject: [PATCH 08/15] optimize code

---
 paddle/operators/recurrent_op.cc           | 38 ++++++++++++----------
 paddle/operators/recurrent_op.h            |  4 +--
 paddle/operators/rnn/recurrent_op_utils.cc |  8 ++---
 3 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 016e2043fd..bcd6a3410a 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -32,24 +32,25 @@ void RecurrentAlgorithm::Run(const Scope& scope,
                              const platform::DeviceContext& dev_ctx) const {
   auto* input0 = scope.FindVar(arg_->inlinks[0]);
   PADDLE_ENFORCE_NOT_NULL(input0);
-  seq_len_ = input0->GetMutable<LoDTensor>()->dims()[0];
-  PADDLE_ENFORCE_GT(seq_len_, 0);
+  size_t seq_len = input0->GetMutable<LoDTensor>()->dims()[0];
+  PADDLE_ENFORCE_GT(seq_len, 0);
 
-  CreateScopes(scope);
+  CreateScopes(scope, seq_len);
   auto& step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_);
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
   InitMemories(step_scopes[0]);
 
-  for (size_t i = 0; i < seq_len_; i++) {
-    if (i > 0) {
-      rnn::LinkMemories(step_scopes, arg_->memories, i, -1);
+  for (size_t step_id = 0; step_id < seq_len; step_id++) {
+    if (step_id > 0) {
+      rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1);
     }
-    (*stepnet_)->Run(*step_scopes[i], dev_ctx);
+    (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_);
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len);
 }
 
-void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
+void RecurrentAlgorithm::CreateScopes(const Scope& scope,
+                                      size_t seq_len) const {
   // TODO(superjom) Only two scopes are needed for inference, this case will be
   // supported later.
   auto step_scopes_var = scope.FindVar(arg_->step_scopes);
@@ -60,8 +61,8 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
   PADDLE_ENFORCE_NOT_NULL(stepnet_);
   PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs");
 
-  if (seq_len_ > step_scopes->size()) {
-    for (size_t i = step_scopes->size(); i < seq_len_; ++i) {
+  if (seq_len > step_scopes->size()) {
+    for (size_t i = step_scopes->size(); i < seq_len; ++i) {
       auto& step_scope = scope.NewScope();
 
       // create step net's temp inputs
@@ -144,17 +145,18 @@ class RecurrentAlgorithmProtoAndCheckerMaker
 
 void RecurrentGradientAlgorithm::Run(
     const Scope& scope, const platform::DeviceContext& dev_ctx) const {
-  seq_len_ =
-      scope.FindVar(arg_->inlinks[0])->GetMutable<LoDTensor>()->dims()[0];
+  auto* input0 = scope.FindVar(arg_->inlinks[0]);
+  PADDLE_ENFORCE_NOT_NULL(input0);
+  size_t seq_len = input0->GetMutable<LoDTensor>()->dims()[0];
   auto step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_);
-  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
-    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
+  for (int step_id = seq_len - 1; step_id >= 0; --step_id) {
+    if (step_id != seq_len - 1) {
       rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_);
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len);
   LinkBootMemoryGradients(step_scopes[0]);
 }
 
diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h
index 752025e42c..253d7e3284 100644
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -48,7 +48,7 @@ class RecurrentAlgorithm {
    * NOTE the scopes are reused in both the forward and backward, so just
    * create once and expand its size if more steps need.
    */
-  void CreateScopes(const framework::Scope& scope) const;
+  void CreateScopes(const framework::Scope& scope, size_t seq_len) const;
 
   const std::vector<framework::Scope*>& GetStepScopes(
       const framework::Scope& scope) const {
@@ -61,7 +61,6 @@ class RecurrentAlgorithm {
  private:
   std::unique_ptr<framework::OperatorBase>* stepnet_;
   rnn::Argument* arg_;
-  mutable size_t seq_len_;
 };
 
 class RecurrentGradientAlgorithm {
@@ -97,7 +96,6 @@ class RecurrentGradientAlgorithm {
 
  private:
   rnn::Argument* arg_;
-  mutable size_t seq_len_;
   std::unique_ptr<framework::OperatorBase>* stepnet_;
 };
 
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
index a02994f99d..a37d21d480 100644
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -53,12 +53,12 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& outlinks,
                    const size_t seq_len) {
   for (size_t i = 0; i < outlinks.size(); i++) {
-    auto output_var = step_scopes[0]->parent().FindVar(outlinks[i]);
+    auto* output_var = step_scopes[0]->parent().FindVar(outlinks[i]);
     PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.",
                             outlinks[i]);
     LoDTensor* output = output_var->GetMutable<LoDTensor>();
 
-    auto step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
+    auto* step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
     PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]);
     f::DDim step_dims =
         step_scope_var->template GetMutable<LoDTensor>()->dims();
@@ -89,8 +89,8 @@ void LinkMemories(const std::vector<Scope*>& scopes,
       step_id + offset, scopes.size(),
       "offset [%d] is out of range, it must be less than (%d - %d)", offset,
       scopes.size(), step_id);
-  auto scope = scopes[step_id];
-  auto linked_scope = scopes[step_id + offset];
+  auto* scope = scopes[step_id];
+  auto* linked_scope = scopes[step_id + offset];
   for (auto& attr : memories) {
     auto mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
     auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();

From cde542e6524d8fd084983e20e0051a3caf22f6b1 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Oct 2017 14:51:15 -0700
Subject: [PATCH 09/15] optimize auto

---
 paddle/operators/recurrent_op.cc           |  6 +++---
 paddle/operators/rnn/recurrent_op_utils.cc | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index bcd6a3410a..04c4c24951 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -53,9 +53,9 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope,
                                       size_t seq_len) const {
   // TODO(superjom) Only two scopes are needed for inference, this case will be
   // supported later.
-  auto step_scopes_var = scope.FindVar(arg_->step_scopes);
+  auto* step_scopes_var = scope.FindVar(arg_->step_scopes);
   PADDLE_ENFORCE(step_scopes_var != nullptr, "");
-  auto step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>();
+  auto* step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>();
 
   // Now all variables in scope must be created outside of op.
   PADDLE_ENFORCE_NOT_NULL(stepnet_);
@@ -148,7 +148,7 @@ void RecurrentGradientAlgorithm::Run(
   auto* input0 = scope.FindVar(arg_->inlinks[0]);
   PADDLE_ENFORCE_NOT_NULL(input0);
   size_t seq_len = input0->GetMutable<LoDTensor>()->dims()[0];
-  auto step_scopes = GetStepScopes(scope);
+  auto& step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
   for (int step_id = seq_len - 1; step_id >= 0; --step_id) {
     if (step_id != seq_len - 1) {
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
index a37d21d480..ef317a71f1 100644
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -92,8 +92,8 @@ void LinkMemories(const std::vector<Scope*>& scopes,
   auto* scope = scopes[step_id];
   auto* linked_scope = scopes[step_id + offset];
   for (auto& attr : memories) {
-    auto mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
-    auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
+    auto* mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
+    auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
     mem->Resize(linked_mem->dims());
     mem->ShareDataWith<float>(*linked_mem);
   }
@@ -106,11 +106,11 @@ void InitArgument(const ArgumentName& name, Argument* arg,
   arg->inlinks = op.Inputs(name.inlinks);
   arg->outlinks = op.Outputs(name.outlinks);
 
-  auto boot_memories =
+  auto& boot_memories =
       is_grad ? op.Outputs(name.boot_memories) : op.Inputs(name.boot_memories);
   // attributes
-  auto memories = op.Attr<std::vector<std::string>>(name.memories);
-  auto pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
+  auto& memories = op.Attr<std::vector<std::string>>(name.memories);
+  auto& pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
 
   PADDLE_ENFORCE(memories.size() == boot_memories.size(),
                  "the size of memories, boot_memories don't match:%d,%d",

From 9ff1fd41b2e8769d233e160975e036f539cda99f Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 2 Oct 2017 15:57:18 -0700
Subject: [PATCH 10/15] Fix MacOS compile error

The private data `values_shared_` is not used.
---
 paddle/framework/tensor_array.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/paddle/framework/tensor_array.h b/paddle/framework/tensor_array.h
index e76f33d2c0..22ae6a966f 100644
--- a/paddle/framework/tensor_array.h
+++ b/paddle/framework/tensor_array.h
@@ -47,13 +47,6 @@ class TensorArray {
   // max number of values allowed to store.
   const size_t MAX_SIZE{100000};
 
-  /*
-   * Inputs:
-   *   - value_shared: share memory between tensors.
-   */
-  explicit TensorArray(bool values_shared = true)
-      : values_shared_(values_shared) {}
-
   /*
    * Read the value at location `index` in the `TensorArray`.
    */
@@ -111,7 +104,6 @@ class TensorArray {
 
  private:
   mutable std::vector<LoDTensor> values_;
-  bool values_shared_;
 };  // class TensorArray
 
 }  // namespace framework

From adec0d30fe8454f84b6bc61cc8b0385f6483d0c3 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 2 Oct 2017 16:18:26 -0700
Subject: [PATCH 11/15] Simplify SumOp Kernel

---
 paddle/operators/CMakeLists.txt |  6 +++++-
 paddle/operators/sum_op.cc      | 29 +++++++++++++++--------------
 paddle/operators/sum_op.cu      |  4 +---
 paddle/operators/sum_op.h       | 19 -------------------
 4 files changed, 21 insertions(+), 37 deletions(-)

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 43eb4de2c1..0fa1fca2bc 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -103,12 +103,16 @@ set(DEPS_OPS
     recurrent_op
     cond_op
     cross_entropy_op
-    softmax_with_cross_entropy_op)
+    softmax_with_cross_entropy_op
+    sum_op)
+
+
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
   DEPS framework_proto tensor net_op)
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
+op_library(sum_op DEPS net_op)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index c54843faa6..7c422b4770 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include "paddle/operators/sum_op.h"
 #include <vector>
+#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
@@ -57,21 +58,23 @@ or not. But the output only shares the LoD with the first input.
   }
 };
 
-class SumGradOp : public framework::OperatorWithKernel {
+class SumGradOp : public NetOp {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
+  SumGradOp(const std::string& type, const framework::VariableNameMap& inputs,
+            const framework::VariableNameMap& outputs,
+            const framework::AttributeMap& attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    auto& x_grad_names = Outputs(framework::GradVarName("X"));
+    auto out_grad_name = this->Input(framework::GradVarName("Out"));
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    auto x_grad_names = ctx->Outputs(framework::GradVarName("X"));
-    size_t x_length = x_grad_names.size();
-    std::vector<framework::DDim> x_grad_dims;
-    x_grad_dims.reserve(x_length);
-    for (size_t i = 0; i < x_length; ++i) {
-      x_grad_dims.push_back(out_grad_dims);
+    framework::AttributeMap grad_attrs;
+    grad_attrs["scale"] = 1.0f;
+    for (auto& x_grad_name : x_grad_names) {
+      AppendOp(framework::OpRegistry::CreateOp(
+          "scale", {{"X", {out_grad_name}}}, {{"Out", {x_grad_name}}},
+          grad_attrs));
     }
-    ctx->SetOutputsDim(framework::GradVarName("X"), x_grad_dims);
+    CompleteAddOp(false);
   }
 };
 
@@ -81,5 +84,3 @@ class SumGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(sum, ops::SumOp, ops::SumOpMaker, sum_grad, ops::SumGradOp);
 REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(sum_grad,
-                       ops::SumGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu
index a465cf3659..7129e6bf62 100644
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
@@ -13,6 +13,4 @@ limitations under the License. */
 #include "paddle/operators/sum_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(sum_grad,
-                       ops::SumGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
\ No newline at end of file
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
index 7e8fbb9e41..91e5da8b40 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -42,24 +42,5 @@ class SumKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
-class SumGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto outs = context.MultiOutput<Tensor>(framework::GradVarName("X"));
-    for (auto out : outs) {
-      out->mutable_data<T>(context.GetPlace());
-    }
-
-    auto place = context.GetEigenDevice<Place>();
-    auto in = EigenVector<T>::Flatten(*input);
-    for (auto out : outs) {
-      auto result = EigenVector<T>::Flatten(*out);
-      result.device(place) = in;
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle

From ff1bfdedc97eb0834745e812abd619581fde7950 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 2 Oct 2017 17:09:25 -0700
Subject: [PATCH 12/15] Add missing newline at end of sum_op.cu

---
 paddle/operators/sum_op.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu
index 7129e6bf62..b1896d3cd8 100644
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
@@ -13,4 +13,4 @@ limitations under the License. */
 #include "paddle/operators/sum_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
\ No newline at end of file
+REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);

From b3e479da1c9cdb580e4577ebdafc5ec451ca4ed2 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 2 Oct 2017 18:38:49 -0700
Subject: [PATCH 13/15] Fix CI

---
 paddle/framework/grad_op_builder_test.cc | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc
index 55c5fa420e..2dbc2e6620 100644
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -39,28 +39,6 @@ class IOIgnoredOpMaker : public OpProtoAndCheckerMaker {
 
 namespace f = paddle::framework;
 
-TEST(GradOpBuilder, AddTwo) {
-  std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
-      "sum", {{"X", {"x", "y"}}}, {{"Out", {"out"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_add_op =
-      f::OpRegistry::CreateGradOp(*add_op);
-
-  EXPECT_EQ(grad_add_op->Inputs().size(), 1UL);
-  EXPECT_EQ(grad_add_op->Outputs().size(), 1UL);
-  EXPECT_EQ(grad_add_op->Input(f::GradVarName("Out")), f::GradVarName("out"));
-  auto &outputs = grad_add_op->Outputs(f::GradVarName("X"));
-  EXPECT_EQ(2UL, outputs.size());
-  auto in_output = [&outputs](const std::string &name) {
-    for (auto &output_name : outputs) {
-      if (output_name == name) return true;
-    }
-    return false;
-  };
-
-  EXPECT_TRUE(in_output(f::GradVarName("x")));
-  EXPECT_TRUE(in_output(f::GradVarName("y")));
-}
-
 REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP);
 REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP);
 

From 42e7fe05a23067677fe7cf552e9534e329886fbb Mon Sep 17 00:00:00 2001
From: Abhinav Arora <abhinavarora28@gmail.com>
Date: Mon, 2 Oct 2017 20:08:06 -0700
Subject: [PATCH 14/15] Changing learning rate from attribute to input(float)
 (#4568)

* Changing learning rate from attribute to input(float)
* Removing obsolete code
---
 paddle/operators/sgd_op.cc                      |  4 +++-
 paddle/operators/sgd_op.h                       |  2 +-
 paddle/pybind/pybind.cc                         |  7 +++++++
 python/paddle/v2/framework/tests/op_test.py     | 17 +++++++++++------
 python/paddle/v2/framework/tests/test_sgd_op.py |  3 +--
 5 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 3bce95535c..8f9eae4186 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -27,6 +27,8 @@ class SGDOp : public framework::OperatorWithKernel {
                    "Input(param) of SGDOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("grad"),
                    "Input(grad) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("learning_rate"),
+                   "Input(learning_rate) of SGDOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("param_out"),
                    "Output(param_out) of SGDOp should not be null.");
 
@@ -42,9 +44,9 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
   SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("param", "input parameter");
+    AddInput("learning_rate", "learning rate of sgd");
     AddInput("grad", "input gradient");
     AddOutput("param_out", "output parameter");
-    AddAttr<float>("learning_rate", "learning rate of sgd");
     AddComment(R"DOC(
 
 Simplest sgd algorithm.
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index a3fe330894..977d201ced 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -31,7 +31,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
     auto param = ctx.Input<Tensor>("param");
     auto grad = ctx.Input<Tensor>("grad");
     auto param_out = ctx.Output<Tensor>("param_out");
-    float lr = ctx.Attr<float>("learning_rate");
+    float lr = *ctx.Input<float>("learning_rate");
 
     param_out->mutable_data<T>(ctx.GetPlace());
 
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index f4121e9d71..d480427f59 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -143,6 +143,13 @@ All parameter, weight, gradient are variables in Paddle.
       .def("set_int",
            [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
       .def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
+      .def("is_float", [](const Variable &var) { return var.IsType<float>(); })
+      .def("set_float",
+           [](Variable &var, float val) -> void {
+             *var.GetMutable<float>() = val;
+           })
+      .def("get_float",
+           [](const Variable &var) -> float { return var.Get<float>(); })
       .def("get_tensor",
            [](Variable &self) -> LoDTensor * {
              return self.GetMutable<LoDTensor>();
diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py
index 75df2eeddf..81067f38bb 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -46,12 +46,17 @@ def create_op(scope, op_type, inputs, outputs, attrs):
 
 def set_input(scope, op, inputs, place):
     def __set_input__(var_name, var):
-        tensor = scope.find_var(var_name).get_tensor()
-        if isinstance(var, tuple):
-            tensor.set_lod(var[1])
-            var = var[0]
-        tensor.set_dims(var.shape)
-        tensor.set(var, place)
+        if isinstance(var, tuple) or isinstance(var, np.ndarray):
+            tensor = scope.find_var(var_name).get_tensor()
+            if isinstance(var, tuple):
+                tensor.set_lod(var[1])
+                var = var[0]
+            tensor.set_dims(var.shape)
+            tensor.set(var, place)
+        elif isinstance(var, float):
+            scope.find_var(var_name).set_float(var)
+        elif isinstance(var, int):
+            scope.find_var(var_name).set_int(var)
 
     for in_name, in_dup in Operator.get_op_inputs(op.type()):
         if in_name in inputs:
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py
index 64e54d1500..f1125f4edb 100644
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -10,8 +10,7 @@ class TestSGDOp(OpTest):
         g = np.random.random((102, 105)).astype("float32")
         lr = 0.1
 
-        self.inputs = {'param': w, 'grad': g}
-        self.attrs = {'learning_rate': lr}
+        self.inputs = {'param': w, 'grad': g, 'learning_rate': lr}
         self.outputs = {'param_out': w - lr * g}
 
     def test_check_output(self):

From b5dbe88b5ab504f88c6e7eaaa8b27d3965701478 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Mon, 2 Oct 2017 20:26:17 -0700
Subject: [PATCH 15/15] follow comments

---
 paddle/framework/CMakeLists.txt   |   2 +-
 paddle/framework/executor.cc      | 159 +++---------------------------
 paddle/framework/executor.h       |  14 ++-
 paddle/framework/executor_test.cc |  12 ++-
 paddle/platform/CMakeLists.txt    |   2 +
 paddle/platform/device.cc         |  59 +++++++++++
 paddle/platform/device.h          |  45 +++++++++
 7 files changed, 139 insertions(+), 154 deletions(-)
 create mode 100644 paddle/platform/device.cc
 create mode 100644 paddle/platform/device.h

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 984fc62aa3..506d0f9833 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -44,5 +44,5 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto)
+cc_library(executor SRCS executor.cc DEPS op_registry device scope framework_proto)
 cc_test(executor_test SRCS executor_test.cc DEPS executor)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index ebe3259bc0..57e177bb0a 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -15,162 +15,31 @@ limitations under the License. */
 #include "paddle/framework/executor.h"
 #include <memory>
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
-#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
 
-class LinearListView;
-class GraphView;
-
-// Immutable view of a ProgramDesc organized for efficient execution.
-class ProgramDescView {
- public:
-  virtual ~ProgramDescView() {}
-  virtual void Initialize(const ProgramDesc*) = 0;
-  static ProgramDescView* Create(bool is_linear);
-};
-
-class LinearListView : public ProgramDescView {
- public:
-  void Initialize(const ProgramDesc*) override;
-
- private:
-  std::vector<std::unique_ptr<OperatorBase>> ops_;
-};
-
-class GraphView : public ProgramDescView {
- public:
-  void Initialize(const ProgramDesc*) override;
-};
-
-ProgramDescView* ProgramDescView::Create(bool is_linear) {
-  if (is_linear) {
-    return new LinearListView();
-  } else {
-    return new GraphView();
-  }
-}
-
-void LinearListView::Initialize(const ProgramDesc* pdesc) {
-  // get a LinearView of ProgramDesc
-  for (auto& block_desc : pdesc->blocks()) {
-    for (auto& op_desc : block_desc.ops()) {
-      ops_.emplace_back(OpRegistry::CreateOp(op_desc));
-    }
+Executor::Executor(const std::vector<platform::Place>& places) {
+  devices_.resize(places.size());
+  for (size_t i = 0; i < places.size(); i++) {
+    devices_[i] = platform::GetDevice(places[i]);
   }
 }
 
-void GraphView::Initialize(const ProgramDesc* pdesc) {
-  // get a GraphView of ProgramDesc
-}
-
-struct Device {
-  platform::CPUDeviceContext* cpu_device_context;
-#ifndef PADDLE_ONLY_CPU
-  platform::CUDADeviceContext* cuda_device_context;
-#endif
-
-#ifndef PADDLE_ONLY_CPU
-  Device(platform::CPUDeviceContext* cpu, platform::CUDADeviceContext* gpu)
-      : cpu_device_context(cpu), cuda_device_context(gpu) {}
-#else
-  explicit Device(platform::CPUDeviceContext* cpu) : cpu_device_context(cpu) {}
-#endif
-};
-
-class ExecutorImpl : public Executor {
- public:
-  ExecutorImpl(Scope* scope, const Device* device, const ProgramDesc* pdesc,
-               bool is_linear)
-      : scope_(scope),
-        device_(device),
-        program_desc_(pdesc),
-        view_(ProgramDescView::Create(is_linear)) {}
-
-  virtual ~ExecutorImpl() {
-    if (view_) delete view_;
-  }
-
-  void Run() override;
-
-  void Initialize();
-
- private:
-  Scope* scope_;
-  const Device* device_;
-  const ProgramDesc* program_desc_;
-  ProgramDescView* view_;
-};
-
-template <typename T, typename... Args>
-std::unique_ptr<T> make_unique(Args&&... args) {
-  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
-
-platform::CPUDeviceContext* GetCPUDeviceContext(
-    const platform::CPUPlace& place) {
-  static std::unique_ptr<platform::CPUDeviceContext> g_cpu_device_context =
-      make_unique<platform::CPUDeviceContext>(place);
-  return g_cpu_device_context.get();
-}
-
-#ifndef PADDLE_ONLY_CPU
-platform::CUDADeviceContext* GetCUDADeviceContext(
-    const platform::GPUPlace& place) {
-  static std::unique_ptr<platform::CUDADeviceContext> g_cuda_device_context =
-      make_unique<platform::CUDADeviceContext>(place);
-  return g_cuda_device_context.get();
-}
-#endif
-
-Device* GetDevice(const platform::Place& place) {
-  platform::CPUPlace cpu_place;
-#ifndef PADDLE_ONLY_CPU
-  if (platform::is_gpu_place(place)) {
-    platform::GPUPlace gpu_place = boost::get<platform::GPUPlace>(place);
-    static std::unique_ptr<Device> g_device = make_unique<Device>(
-        GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place));
-    return g_device.get();
-  } else {
-    static std::unique_ptr<Device> g_device =
-        make_unique<Device>(GetCPUDeviceContext(cpu_place), nullptr);
-    return g_device.get();
-  }
-#else
-  static std::unique_ptr<Device> g_device =
-      make_unique<Device>(GetCPUDeviceContext(cpu_place));
-  return g_device.get();
-#endif
-}
-
-framework::Scope* GetScope() {
-  static std::unique_ptr<framework::Scope> g_scope =
-      make_unique<framework::Scope>();
-  return g_scope.get();
-}
-
-Executor* NewLocalExecutor(const platform::Place& place,
-                           const ProgramDesc& pdesc, bool is_linear) {
-  return new ExecutorImpl(GetScope(), GetDevice(place), &pdesc, is_linear);
-}
-
-void ExecutorImpl::Run() {
+void Executor::Run(const ProgramDesc& pdesc, Scope* scope,
+                   std::vector<Tensor>* outputs) {
   // operators running
-  scope_->NewVar();
-  device_->cpu_device_context->Wait();
+  Scope& local_scope = scope->NewScope();
+  local_scope.NewVar();
+  for (auto device : devices_) {
+    device->cpu_device_context->Wait();
 #ifndef PADDLE_ONLY_CPU
-  if (device_->cuda_device_context) {
-    device_->cuda_device_context->Wait();
-  }
+    if (device->cuda_device_context) {
+      device->cuda_device_context->Wait();
+    }
 #endif
-}
-
-void ExecutorImpl::Initialize() {
-  // Initialize the ProgramDescView
-  view_->Initialize(program_desc_);
+  }
 }
 
 }  // namespace framework
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index 25ef2d4d48..5d6d7f37a6 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -15,18 +15,22 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/framework/framework.pb.h"
-#include "paddle/platform/place.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device.h"
 
 namespace paddle {
 namespace framework {
 
 class Executor {
  public:
-  virtual ~Executor() {}
-  virtual void Run() = 0;
-};
+  explicit Executor(const std::vector<platform::Place>& places);
+  ~Executor() {}
+  void Run(const ProgramDesc&, Scope*, std::vector<Tensor>*);
 
-Executor* NewLocalExecutor(const platform::Place&, const ProgramDesc&, bool);
+ private:
+  std::vector<platform::Device*> devices_;
+};
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 6f8ca38768..51d2dfc1c3 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -19,9 +19,15 @@ using namespace paddle::platform;
 using namespace paddle::framework;
 
 TEST(Executor, Init) {
+  CPUPlace cpu_place1, cpu_place2;
+  std::vector<Place> places;
+  places.push_back(cpu_place1);
+  places.push_back(cpu_place2);
+  Executor* executor = new Executor(places);
+
   ProgramDesc pdesc;
-  CPUPlace cpu_place;
-  Executor* executor = NewLocalExecutor(cpu_place, pdesc, true);
-  executor->Run();
+  Scope s;
+  std::vector<Tensor>* outputs{nullptr};
+  executor->Run(pdesc, &s, outputs);
   delete executor;
 }
\ No newline at end of file
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index daf519b91d..b581937393 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -23,5 +23,7 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
     system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
 
+cc_library(device SRCS device.cc DEPS device_context)
+
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
diff --git a/paddle/platform/device.cc b/paddle/platform/device.cc
new file mode 100644
index 0000000000..7acd87c8c3
--- /dev/null
+++ b/paddle/platform/device.cc
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/device.h"
+
+namespace paddle {
+namespace platform {
+
+template <typename T, typename... Args>
+std::unique_ptr<T> make_unique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+CPUDeviceContext* GetCPUDeviceContext(const CPUPlace& place) {
+  static std::unique_ptr<CPUDeviceContext> g_cpu_device_context =
+      make_unique<CPUDeviceContext>(place);
+  return g_cpu_device_context.get();
+}
+
+#ifndef PADDLE_ONLY_CPU
+CUDADeviceContext* GetCUDADeviceContext(const GPUPlace& place) {
+  static std::unique_ptr<CUDADeviceContext> g_cuda_device_context =
+      make_unique<CUDADeviceContext>(place);
+  return g_cuda_device_context.get();
+}
+#endif
+
+Device* GetDevice(const Place& place) {
+  CPUPlace cpu_place;
+#ifndef PADDLE_ONLY_CPU
+  if (is_gpu_place(place)) {
+    GPUPlace gpu_place = boost::get<GPUPlace>(place);
+    static std::unique_ptr<Device> g_device = make_unique<Device>(
+        GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place));
+    return g_device.get();
+  } else {
+    static std::unique_ptr<Device> g_device =
+        make_unique<Device>(GetCPUDeviceContext(cpu_place), nullptr);
+    return g_device.get();
+  }
+#else
+  static std::unique_ptr<Device> g_device =
+      make_unique<Device>(GetCPUDeviceContext(cpu_place));
+  return g_device.get();
+#endif
+}
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/device.h b/paddle/platform/device.h
new file mode 100644
index 0000000000..b1bb8073cf
--- /dev/null
+++ b/paddle/platform/device.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace platform {
+
+struct Device {
+  CPUDeviceContext* cpu_device_context;
+#ifndef PADDLE_ONLY_CPU
+  CUDADeviceContext* cuda_device_context;
+#endif
+
+#ifndef PADDLE_ONLY_CPU
+  Device(CPUDeviceContext* cpu, CUDADeviceContext* gpu)
+      : cpu_device_context(cpu), cuda_device_context(gpu) {}
+#else
+  explicit Device(CPUDeviceContext* cpu) : cpu_device_context(cpu) {}
+#endif
+};
+
+CPUDeviceContext* GetCPUDeviceContext(const platform::CPUPlace& place);
+
+#ifndef PADDLE_ONLY_CPU
+CUDADeviceContext* GetCUDADeviceContext(const platform::GPUPlace& place);
+#endif
+
+Device* GetDevice(const platform::Place& place);
+}  // namespace platform
+}  // namespace paddle