From d3d16f76f583ca3f46a13e62f6f670acdcccbb5c Mon Sep 17 00:00:00 2001
From: ying <lcy.seso@gmail.com>
Date: Wed, 7 Mar 2018 09:39:53 +0800
Subject: [PATCH 01/57] enhance reshape operator.

---
 paddle/fluid/operators/reshape_op.cc          | 97 ++++++++++++-------
 paddle/fluid/operators/reshape_op.h           | 48 ++++++++-
 .../paddle/fluid/tests/unittests/op_test.py   |  8 +-
 .../unittests/test_mine_hard_examples_op.py   |  0
 .../fluid/tests/unittests/test_reshape_op.py  | 56 +++++++----
 .../tests/unittests/test_target_assign_op.py  |  0
 6 files changed, 150 insertions(+), 59 deletions(-)
 mode change 100755 => 100644 python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
 mode change 100755 => 100644 python/paddle/fluid/tests/unittests/test_target_assign_op.py
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 3580932356..c47df73405 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -31,48 +31,69 @@ class ReshapeOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ReshapeOp should not be null.");
 
-    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
+    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+
+    PADDLE_ENFORCE_EQ(shape.empty(), ctx->HasInput("Shape"),
+                      "The shape information can only be set by Attr(shape) or "
+                      "by Input(Shape). Attr(shape) and Input(Shape) cannot be "
+                      "set at the same time.");
+
     auto x_dims = ctx->GetInputDim("X");
 
-    std::vector<size_t> neg_dims_idx;
-    // set some dimension to -1 if it is unknown
-    const int unknown_size = -1;
-    for (size_t i = 0; i < shape.size(); ++i) {
-      PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size,
-                     "Each dimension of Attr(shape) must be positive or %d.",
-                     unknown_size);
-      if (shape[i] == unknown_size) {
-        neg_dims_idx.push_back(i);
-        PADDLE_ENFORCE(neg_dims_idx.size() <= 1,
-                       "Only one dimension of Attr(shape) can be unknown.");
-      }
-    }
+    if (ctx->HasInput("Shape")) {
+      auto shape_dims = ctx->GetInputDim("Shape");
 
-    int64_t capacity =
-        std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
-    int64_t in_size = framework::product(x_dims);
-    if (neg_dims_idx.size() == 1) {
-      // dim infer
-      shape[neg_dims_idx[0]] = in_size / (-capacity);
-      // recalculate capacity
-      capacity = shape[neg_dims_idx[0]] * (-capacity);
+      PADDLE_ENFORCE(shape_dims.size() == 2UL && shape_dims[0] == 1UL,
+                     "The Input(Label) should be a 2-D tensor with the 1st "
+                     "dimensions fixed to 1 (a row vector).");
+
+      // The actual output shape will be set at runtime, here temporially the
+      // the shape of output the same as the shape of input.
+      ctx->SetOutputDim("Out", x_dims);
+    } else {
+      std::vector<int64_t> output_shape;
+      ValidateShape(shape, framework::product(x_dims), output_shape);
+
+      auto out_dims = framework::make_ddim(output_shape);
+      ctx->SetOutputDim("Out", out_dims);
     }
-    // capacity check
-    PADDLE_ENFORCE(capacity == in_size,
-                   "The size of Input(X) mismatches with Attr(shape).");
-    // resize output
-    std::vector<int64_t> shape_int64(shape.size(), 0);
-    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
-                   [](int a) { return static_cast<int64_t>(a); });
-    auto out_dims = framework::make_ddim(shape_int64);
-    ctx->SetOutputDim("Out", out_dims);
+
     if (shape[0] == x_dims[0]) {
-      // Only pass LoD when the first dimension is equal between
-      // output and input.
+      // Only pass LoD when the first dimension of output and input are the
+      // same.
       ctx->ShareLoD("X", /*->*/ "Out");
     }
   }
+
+ private:
+  void ValidateShape(const std::vector<int> &shape, const int64_t in_size,
+                     std::vector<int64_t> &output_shape) const {
+    std::vector<size_t> neg_dims_idx;
+    const int unknown_index = -1;  // only one dimension canbe set to -1, whose
+                                   // size will be automatically infered.
+
+    for (size_t i = 0; i < shape.size(); ++i) {
+      PADDLE_ENFORCE(shape[i] > 1 || shape[i] == unknown_index,
+                     "Each input dimension of Attr(shape) must be positive, or "
+                     "only one input dimension can be -1.");
+      if (shape[i] == unknown_index) neg_dims_idx.push_back(i);
+    }
+    PADDLE_ENFORCE_LE(
+        neg_dims_idx.size(), 1,
+        "Only one input dimension of Attr(shape) may be unknown.");
+
+    int64_t inferred_dim = 0;
+    if (neg_dims_idx.size()) {
+      int64_t capacity = std::accumulate(shape.begin(), shape.end(), 1,
+                                         std::multiplies<int>());
+      inferred_dim = in_size / (-capacity);
+    }
+
+    output_shape.resize(shape.size(), 0);
+    std::transform(shape.begin(), shape.end(), output_shape.begin(),
+                   [](int a) { return static_cast<int64_t>(a); });
+    if (neg_dims_idx.size()) output_shape[neg_dims_idx[0]] = inferred_dim;
+  }
 };
 
 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -80,10 +101,12 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
   ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of reshape operator.");
+    AddInput("Shape", "a 1-D tensor that provides the shape information.")
+        .AsDispensable();
     AddOutput("Out", "The output tensor of reshape operator.");
     AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) "
-                              "Target shape of reshape operator.");
+                              "(vector<int>) Target shape of reshape operator.")
+        .SetDefault(std::vector<int>());
     AddComment(R"DOC(
 Reshape Operator.
 
@@ -96,7 +119,7 @@ and target shape = [1, 4], the reshape operator will transform
 the tensor X into a 2-D tensor: [[1, 2, 3, 4]]
 
 One dimension in the target shape can be set -1, representing that its
-size is unknown. In this case, the real dimension will be infered from 
+size is unknown. In this case, the real dimension will be infered from
 the original shape of Input(X) and other dimensions in the target shape.
 )DOC");
   }
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index 1357bce4b7..fc0885c149 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -26,11 +26,57 @@ class ReshapeKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* out = ctx.Output<framework::Tensor>("Out");
     auto* in = ctx.Input<framework::Tensor>("X");
-    auto out_dims = out->dims();
+
+    auto* shape = ctx.Input<framework::Tensor>("Shape");
+    framework::DDim out_dims;
+    if (shape) {
+      std::vector<int64_t> output_shape;
+      ValidateShape(*shape, framework::product(in->dims()), output_shape);
+
+      for (auto d : output_shape) std::cout << d << " ";
+      std::cout << std::endl;
+
+      out_dims = framework::make_ddim(output_shape);
+    } else {
+      out_dims = out->dims();
+    }
+
     out->mutable_data<T>(ctx.GetPlace());
     framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
     out->Resize(out_dims);
   }
+
+ private:
+  void ValidateShape(const framework::Tensor& shape, const int64_t in_size,
+                     std::vector<int64_t>& output_shape) const {
+    std::vector<size_t> neg_dims_idx;
+    const int unknown_index = -1;  // only one dimension canbe set to -1, whose
+                                   // size will be automatically infered.
+
+    const int64_t dimension = shape.dims()[1];
+    std::cout << "dimension =" << dimension << std::endl;
+    const T* shape_data = shape.data<T>();
+
+    for (int64_t i = 0; i < dimension; ++i) {
+      PADDLE_ENFORCE(shape_data[i] > 1 || shape_data[i] == unknown_index,
+                     "Each input dimension of Attr(shape) must be positive, or "
+                     "only one input dimension can be -1.");
+      if (shape_data[i] == unknown_index) neg_dims_idx.push_back(i);
+    }
+    PADDLE_ENFORCE_LE(
+        neg_dims_idx.size(), 1,
+        "Only one input dimension of Attr(shape) can be unknown.");
+
+    int64_t capacity = 1;
+    output_shape.resize(dimension, 0);
+    for (int64_t i = 0; i < dimension; ++i) {
+      capacity *= shape_data[i];
+      output_shape[i] = static_cast<int64_t>(shape_data[i]);
+    }
+
+    if (neg_dims_idx.size())
+      output_shape[neg_dims_idx[0]] = in_size / (-capacity);
+  }
 };
 
 template <typename DeviceContext, typename T>
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index f7e02595ec..26835336ad 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -334,7 +334,7 @@ class OpTest(unittest.TestCase):
                     np.allclose(
                         actual_t, expect_t, atol=atol),
                     "Output (" + out_name + ") has diff at " + str(place) +
-                    str(actual_t) + str(expect_t))
+                    str(actual_t) + "\n" + str(expect_t))
                 if isinstance(expect, tuple):
                     self.assertListEqual(actual.lod(), expect[1],
                                          "Output (" + out_name +
@@ -546,6 +546,6 @@ class OpTest(unittest.TestCase):
 
         fetch_list = [g for p, g in param_grad_list]
         executor = Executor(place)
-        return map(
-            np.array,
-            executor.run(prog, feed_dict, fetch_list, return_numpy=False))
+        return map(np.array,
+                   executor.run(prog, feed_dict, fetch_list,
+                                return_numpy=False))
diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
old mode 100755
new mode 100644
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 6d1aa549d5..ae1cca0c3e 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -14,29 +14,51 @@
 
 import unittest
 import numpy as np
-from op_test import OpTest
-
+import pdb
 
-class TestReshapeOp(OpTest):
-    def setUp(self):
-        self.op_type = "reshape"
-        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-        self.attrs = {'shape': [10 * 20]}
-        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+from op_test import OpTest
 
-    def test_check_output(self):
-        self.check_output()
+# class TestReshapeOp1(OpTest):
+#     def setUp(self):
+#         ori_shape = (2, 25)
+#         new_shape = [5, 10]
+# 
+#         self.op_type = "reshape"
+#         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+#         self.attrs = {"shape": new_shape}
+#         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+# 
+#     def test_check_output(self):
+#         self.check_output()
+# 
+#     def test_check_grad(self):
+#         self.check_grad(["X"], "Out")
 
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+# class TestReshapeOpDimInfer1(OpTest):
+#     def setUp(self):
+#         self.op_type = "reshape"
+#         self.inputs = {"X": np.random.random((5, 10)).astype("float32")}
+#         self.attrs = {"shape": [5, -1, 5]}
+#         self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
+# 
+#     def test_check_output(self):
+#         self.check_output()
+# 
+#     def test_check_grad(self):
+#         self.check_grad(["X"], "Out")
 
 
-class TestReshapeOpDimInfer(OpTest):
+class TestReshapeOp2(OpTest):
     def setUp(self):
+        ori_shape = (2, 25)
+        new_shape = ([5, 10], )
+
         self.op_type = "reshape"
-        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-        self.attrs = {'shape': [4, -1, 5]}
-        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+        self.inputs = {
+            "X": np.random.random(ori_shape).astype("float32"),
+            "Shape": np.array(new_shape)
+        }
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape[0])}
 
     def test_check_output(self):
         self.check_output()
@@ -45,5 +67,5 @@ class TestReshapeOpDimInfer(OpTest):
         self.check_grad(["X"], "Out")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
old mode 100755
new mode 100644

From 1d4dfc096666fd2c482969a44b188faa4362f064 Mon Sep 17 00:00:00 2001
From: caoying03 <caoying03@baidu.com>
Date: Mon, 12 Mar 2018 10:28:22 +0800
Subject: [PATCH 02/57] fix bugs.

---
 paddle/fluid/operators/reshape_op.cc          | 39 ++++++++++++++-----
 paddle/fluid/operators/reshape_op.h           | 14 ++++---
 .../fluid/tests/unittests/test_reshape_op.py  | 33 +++++++++++++++-
 3 files changed, 69 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index c47df73405..2ad49437a9 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -32,7 +32,6 @@ class ReshapeOp : public framework::OperatorWithKernel {
                    "Output(Out) of ReshapeOp should not be null.");
 
     const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-
     PADDLE_ENFORCE_EQ(shape.empty(), ctx->HasInput("Shape"),
                       "The shape information can only be set by Attr(shape) or "
                       "by Input(Shape). Attr(shape) and Input(Shape) cannot be "
@@ -41,27 +40,29 @@ class ReshapeOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
 
     if (ctx->HasInput("Shape")) {
+      // The shape information in given by Input(Shape).
       auto shape_dims = ctx->GetInputDim("Shape");
 
       PADDLE_ENFORCE(shape_dims.size() == 2UL && shape_dims[0] == 1UL,
                      "The Input(Label) should be a 2-D tensor with the 1st "
                      "dimensions fixed to 1 (a row vector).");
 
-      // The actual output shape will be set at runtime, here temporially the
+      // The actual output shape will be set at runtime, here temporially set
       // the shape of output the same as the shape of input.
       ctx->SetOutputDim("Out", x_dims);
     } else {
+      // The shape information in given by Attr(shape).
       std::vector<int64_t> output_shape;
       ValidateShape(shape, framework::product(x_dims), output_shape);
 
       auto out_dims = framework::make_ddim(output_shape);
       ctx->SetOutputDim("Out", out_dims);
-    }
 
-    if (shape[0] == x_dims[0]) {
-      // Only pass LoD when the first dimension of output and input are the
-      // same.
-      ctx->ShareLoD("X", /*->*/ "Out");
+      if (shape[0] == x_dims[0]) {
+        // Only pass LoD when the first dimension of output and Input(X)
+        // are the same.
+        ctx->ShareLoD("X", /*->*/ "Out");
+      }
     }
   }
 
@@ -94,6 +95,14 @@ class ReshapeOp : public framework::OperatorWithKernel {
                    [](int a) { return static_cast<int64_t>(a); });
     if (neg_dims_idx.size()) output_shape[neg_dims_idx[0]] = inferred_dim;
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -101,11 +110,13 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
   ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of reshape operator.");
-    AddInput("Shape", "a 1-D tensor that provides the shape information.")
+    AddInput(
+        "Shape",
+        "Tensor<int64_t>, a 1-D tensor that provides the shape information.")
         .AsDispensable();
     AddOutput("Out", "The output tensor of reshape operator.");
-    AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) Target shape of reshape operator.")
+    AddAttr<std::vector<int>>(
+        "shape", "(std::vector<int>) Target shape of reshape operator.")
         .SetDefault(std::vector<int>());
     AddComment(R"DOC(
 Reshape Operator.
@@ -139,6 +150,14 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
                    "Input(Out@GRAD) shouldn't be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index fc0885c149..0c97dc639f 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -33,9 +33,6 @@ class ReshapeKernel : public framework::OpKernel<T> {
       std::vector<int64_t> output_shape;
       ValidateShape(*shape, framework::product(in->dims()), output_shape);
 
-      for (auto d : output_shape) std::cout << d << " ";
-      std::cout << std::endl;
-
       out_dims = framework::make_ddim(output_shape);
     } else {
       out_dims = out->dims();
@@ -85,11 +82,18 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
     d_x->mutable_data<T>(ctx.GetPlace());
+    bool inplace = ctx.Attr<bool>("inplace");
 
     auto in_dims = d_x->dims();
-    framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
-    d_x->Resize(in_dims);
+    if (!inplace) {
+      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+      d_x->Resize(in_dims);
+    } else {
+      d_x->ShareDataWith(*d_out);
+      d_x->Resize(in_dims);
+    }
   }
 };
 }  // namespace operators
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index ae1cca0c3e..dc96aed8db 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -33,7 +33,8 @@ from op_test import OpTest
 # 
 #     def test_check_grad(self):
 #         self.check_grad(["X"], "Out")
-
+# 
+# 
 # class TestReshapeOpDimInfer1(OpTest):
 #     def setUp(self):
 #         self.op_type = "reshape"
@@ -56,7 +57,8 @@ class TestReshapeOp2(OpTest):
         self.op_type = "reshape"
         self.inputs = {
             "X": np.random.random(ori_shape).astype("float32"),
-            "Shape": np.array(new_shape)
+            "Shape": np.array(
+                new_shape, dtype="int64")
         }
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape[0])}
 
@@ -67,5 +69,32 @@ class TestReshapeOp2(OpTest):
         self.check_grad(["X"], "Out")
 
 
+# class TestReshapeOpInplace(OpTest):
+#     def setUp(self):
+#         self.op_type = "reshape"
+#         self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
+#         self.attrs = {'shape': [10 * 20], 'inplace': True}
+#         self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+# 
+#     def test_check_output(self):
+#         self.check_output()
+# 
+#     def test_check_grad(self):
+#         self.check_grad(["X"], "Out")
+# 
+# 
+# class TestReshapeOpDimInferInplace(OpTest):
+#     def setUp(self):
+#         self.op_type = "reshape"
+#         self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
+#         self.attrs = {'shape': [4, -1, 5], 'inplace': True}
+#         self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+# 
+#     def test_check_output(self):
+#         self.check_output()
+# 
+#     def test_check_grad(self):
+#         self.check_grad(["X"], "Out")
+
 if __name__ == "__main__":
     unittest.main()

From cf081851453a42bb6c7ea707b4f998e208d0e2a1 Mon Sep 17 00:00:00 2001
From: caoying03 <caoying03@baidu.com>
Date: Mon, 12 Mar 2018 13:05:47 +0800
Subject: [PATCH 03/57] fix bugs and complete codes.

---
 paddle/fluid/operators/reshape_op.cc          |  94 +++++------
 paddle/fluid/operators/reshape_op.h           |  61 +++----
 python/paddle/fluid/layers/detection.py       |  17 +-
 python/paddle/fluid/layers/nn.py              |  56 +++++++
 python/paddle/fluid/layers/ops.py             |   1 -
 .../fluid/tests/unittests/test_reshape_op.py  | 158 ++++++++++--------
 6 files changed, 220 insertions(+), 167 deletions(-)

diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index b094e649c3..c0d08cc690 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -25,39 +25,28 @@ class ReshapeOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    // input check
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of ReshapeOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ReshapeOp should not be null.");
 
     const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    PADDLE_ENFORCE_EQ(shape.empty(), ctx->HasInput("Shape"),
-                      "The shape information can only be set by Attr(shape) or "
-                      "by Input(Shape). Attr(shape) and Input(Shape) cannot be "
-                      "set at the same time.");
+    PADDLE_ENFORCE(!shape.empty(),
+                   "The shape information must be set by Attr(shape).");
 
+    std::vector<int64_t> output_shape;
     auto x_dims = ctx->GetInputDim("X");
+    bool need_copy_dim = ValidateShape(shape, x_dims, output_shape);
 
-    if (ctx->HasInput("Shape")) {
-      // The shape information in given by Input(Shape).
-      auto shape_dims = ctx->GetInputDim("Shape");
-
-      PADDLE_ENFORCE(shape_dims.size() == 2UL && shape_dims[0] == 1UL,
-                     "The Input(Label) should be a 2-D tensor with the 1st "
-                     "dimensions fixed to 1 (a row vector).");
-
-      // The actual output shape will be set at runtime, here temporially set
-      // the shape of output the same as the shape of input.
+    if (need_copy_dim) {
+      // Some dimensions can only be determined during runtime. Here temporarily
+      // set output tensor's shape the same as that of the input tensor.
       ctx->SetOutputDim("Out", x_dims);
     } else {
-      // The shape information in given by Attr(shape).
-      std::vector<int64_t> output_shape;
-      ValidateShape(shape, framework::product(x_dims), output_shape);
-
-      auto out_dims = framework::make_ddim(output_shape);
-      ctx->SetOutputDim("Out", out_dims);
+      ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
 
+      // FIXME(caoying): When shape of the output tensor is determined during
+      // runtime, LoD information of X will not passed to the output.
       if (shape[0] == x_dims[0]) {
         // Only pass LoD when the first dimension of output and Input(X)
         // are the same.
@@ -67,41 +56,51 @@ class ReshapeOp : public framework::OperatorWithKernel {
   }
 
  private:
-  void ValidateShape(const std::vector<int> &shape, const int64_t in_size,
+  bool ValidateShape(const std::vector<int> &shape,
+                     const framework::DDim &input_dim,
                      std::vector<int64_t> &output_shape) const {
-    std::vector<size_t> neg_dims_idx;
-    const int unknown_index = -1;  // only one dimension canbe set to -1, whose
-                                   // size will be automatically infered.
+    // only one dimension canbe set to -1, whose size will be automatically
+    // infered.
+    const int64_t unknown_index = -1;
+    const auto in_size = framework::product(input_dim);
+    const auto x_rank = input_dim.size();
 
+    bool need_dim_copy = false;
+    std::vector<size_t> neg_dims_idx;
     for (size_t i = 0; i < shape.size(); ++i) {
-      PADDLE_ENFORCE(shape[i] > 1 || shape[i] == unknown_index,
+      PADDLE_ENFORCE(shape[i] >= 0 || shape[i] == unknown_index,
                      "Each input dimension of Attr(shape) must be positive, or "
                      "only one input dimension can be -1.");
-      if (shape[i] == unknown_index) neg_dims_idx.push_back(i);
+      if (shape[i] == unknown_index) {
+        neg_dims_idx.push_back(i);
+      } else if (shape[i] == 0) {
+        PADDLE_ENFORCE_LT(
+            i, x_rank,
+            "Only dimension less than rank of Input(X) can be set to 0.");
+        need_dim_copy = true;
+      }
     }
     PADDLE_ENFORCE_LE(
         neg_dims_idx.size(), 1,
         "Only one input dimension of Attr(shape) may be unknown.");
 
+    output_shape.resize(shape.size(), 0);
+    std::transform(shape.begin(), shape.end(), output_shape.begin(),
+                   [](int a) { return static_cast<int64_t>(a); });
+
+    // some dimension can only be determinted during runtime.
+    if (need_dim_copy) return need_dim_copy;
+
     int64_t inferred_dim = 0;
     if (neg_dims_idx.size()) {
       int64_t capacity = std::accumulate(shape.begin(), shape.end(), 1,
                                          std::multiplies<int>());
       inferred_dim = in_size / (-capacity);
+      PADDLE_ENFORCE_EQ(inferred_dim * (-capacity), in_size,
+                        "Invalid shape is given.");
+      output_shape[neg_dims_idx[0]] = inferred_dim;
     }
-
-    output_shape.resize(shape.size(), 0);
-    std::transform(shape.begin(), shape.end(), output_shape.begin(),
-                   [](int a) { return static_cast<int64_t>(a); });
-    if (neg_dims_idx.size()) output_shape[neg_dims_idx[0]] = inferred_dim;
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return false;
   }
 };
 
@@ -110,14 +109,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
   ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of reshape operator.");
-    AddInput(
-        "Shape",
-        "Tensor<int64_t>, a 1-D tensor that provides the shape information.")
-        .AsDispensable();
     AddOutput("Out", "The output tensor of reshape operator.");
     AddAttr<std::vector<int>>(
-        "shape", "(std::vector<int>) Target shape of reshape operator.")
-        .SetDefault(std::vector<int>());
+        "shape", "(std::vector<int>) Target shape of reshape operator.");
     AddAttr<bool>("inplace",
                   "Change the source tensor's shape without copy memory.")
         .SetDefault(true);
@@ -153,14 +147,6 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
                    "Input(Out@GRAD) shouldn't be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
-  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index 23fbf1655c..9dbc5cec6b 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -27,17 +27,8 @@ class ReshapeKernel : public framework::OpKernel<T> {
     auto* out = ctx.Output<framework::Tensor>("Out");
     auto* in = ctx.Input<framework::Tensor>("X");
 
-    auto* shape = ctx.Input<framework::Tensor>("Shape");
-    framework::DDim out_dims;
-    if (shape) {
-      std::vector<int64_t> output_shape;
-      ValidateShape(*shape, framework::product(in->dims()), output_shape);
-
-      out_dims = framework::make_ddim(output_shape);
-    } else {
-      out_dims = out->dims();
-    }
-
+    auto out_dims =
+        ValidateShape(ctx.Attr<std::vector<int>>("shape"), in->dims());
     bool inplace = ctx.Attr<bool>("inplace");
     if (!inplace) {
       out->mutable_data<T>(ctx.GetPlace());
@@ -50,35 +41,31 @@ class ReshapeKernel : public framework::OpKernel<T> {
   }
 
  private:
-  void ValidateShape(const framework::Tensor& shape, const int64_t in_size,
-                     std::vector<int64_t>& output_shape) const {
-    std::vector<size_t> neg_dims_idx;
-    const int unknown_index = -1;  // only one dimension canbe set to -1, whose
-                                   // size will be automatically infered.
-
-    const int64_t dimension = shape.dims()[1];
-    std::cout << "dimension =" << dimension << std::endl;
-    const T* shape_data = shape.data<T>();
-
-    for (int64_t i = 0; i < dimension; ++i) {
-      PADDLE_ENFORCE(shape_data[i] > 1 || shape_data[i] == unknown_index,
-                     "Each input dimension of Attr(shape) must be positive, or "
-                     "only one input dimension can be -1.");
-      if (shape_data[i] == unknown_index) neg_dims_idx.push_back(i);
-    }
-    PADDLE_ENFORCE_LE(
-        neg_dims_idx.size(), 1,
-        "Only one input dimension of Attr(shape) can be unknown.");
-
+  framework::DDim ValidateShape(const std::vector<int> shape_attr,
+                                const framework::DDim& in_dims) const {
+    const int64_t in_size = framework::product(in_dims);
+    // only one dimension canbe set to -1, whose size will be automatically
+    // infered.
+    const int64_t unknown_index = -1;
+
+    std::vector<int64_t> output_shape(shape_attr.size(), 0);
     int64_t capacity = 1;
-    output_shape.resize(dimension, 0);
-    for (int64_t i = 0; i < dimension; ++i) {
-      capacity *= shape_data[i];
-      output_shape[i] = static_cast<int64_t>(shape_data[i]);
+    int neg_dim_idx = -1;
+    for (size_t i = 0; i < shape_attr.size(); ++i) {
+      if (shape_attr[i] == unknown_index) neg_dim_idx = i;
+      capacity *= (shape_attr[i] ? shape_attr[i] : in_dims[i]);
+      output_shape[i] =
+          (shape_attr[i] ? static_cast<int64_t>(shape_attr[i]) : in_dims[i]);
     }
 
-    if (neg_dims_idx.size())
-      output_shape[neg_dims_idx[0]] = in_size / (-capacity);
+    if (neg_dim_idx != -1) {
+      output_shape[neg_dim_idx] = -in_size / capacity;
+      PADDLE_ENFORCE_EQ(output_shape[neg_dim_idx] * capacity, -in_size,
+                        "Invalid shape is given.");
+    } else {
+      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
+    }
+    return framework::make_ddim(output_shape);
   }
 };
 
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 2bf7cf21ca..d326c5651f 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -19,7 +19,6 @@ from layer_function_generator import generate_layer_fn
 from layer_function_generator import autodoc
 from ..layer_helper import LayerHelper
 import tensor
-import ops
 import nn
 import math
 
@@ -58,7 +57,7 @@ def detection_output(loc,
 
     This operation is to get the detection results by performing following
     two steps:
-    
+
     1. Decode input bounding box predictions according to the prior boxes.
     2. Get the final detection results by applying multi-class non maximum
        suppression (NMS).
@@ -458,7 +457,7 @@ def ssd_loss(location,
     num, num_prior, num_class = confidence.shape
 
     def __reshape_to_2d(var):
-        return ops.reshape(x=var, shape=[-1, var.shape[-1]])
+        return nn.reshape(x=var, shape=[-1, var.shape[-1]])
 
     # 1. Find matched boundding box by prior box.
     #   1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
@@ -469,7 +468,7 @@ def ssd_loss(location,
 
     # 2. Compute confidence for mining hard examples
     # 2.1. Get the target label based on matched indices
-    gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, ))
+    gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, ))
     target_label, _ = target_assign(
         gt_label, matched_indices, mismatch_value=background_label)
     # 2.2. Compute confidence loss.
@@ -480,7 +479,7 @@ def ssd_loss(location,
     conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
 
     # 3. Mining hard examples
-    conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior))
+    conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior))
     neg_indices = helper.create_tmp_variable(dtype='int32')
     dtype = matched_indices.dtype
     updated_matched_indices = helper.create_tmp_variable(dtype=dtype)
@@ -548,7 +547,7 @@ def ssd_loss(location,
     # 5.3 Compute overall weighted loss.
     loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
     # reshape to [N, Np], N is the batch size and Np is the prior box number.
-    loss = ops.reshape(x=loss, shape=[-1, num_prior])
+    loss = nn.reshape(x=loss, shape=[-1, num_prior])
     loss = nn.reduce_sum(loss, dim=1, keep_dim=True)
     if normalize:
         normalizer = nn.reduce_sum(target_loc_weight)
@@ -696,7 +695,7 @@ def multi_box_head(inputs,
         new_shape = [
             -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)])
         ]
-        out = ops.reshape(x=input, shape=new_shape)
+        out = nn.reshape(x=input, shape=new_shape)
         return out
 
     def _is_list_or_tuple_(data):
@@ -793,7 +792,7 @@ def multi_box_head(inputs,
             mbox_loc.shape[0],
             mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4
         ]
-        mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape)
+        mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape)
         mbox_locs.append(mbox_loc_flatten)
 
         # get conf_loc
@@ -809,7 +808,7 @@ def multi_box_head(inputs,
             conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
             conf_loc.shape[3] / num_classes, num_classes
         ]
-        conf_loc_flatten = ops.reshape(conf_loc, shape=new_shape)
+        conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape)
         mbox_confs.append(conf_loc_flatten)
 
     if len(box_results) == 1:
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 10b0405f47..67a6fd8084 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -70,6 +70,7 @@ __all__ = [
     'smooth_l1',
     'one_hot',
     'autoincreased_step_counter',
+    'reshape',
 ]
 
 
@@ -3184,6 +3185,8 @@ def one_hot(input, depth):
          The one-hot tensor or LodTensor, same as input.
 
     Examples:
+        .. code-block:: python
+
         X is a LoDTensor:
           X.lod = [[0, 1, 4]]
           X.shape = [4, 1]
@@ -3236,3 +3239,56 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
         counter.stop_gradient = True
 
     return counter
+
+
+def reshape(x, shape, act=None, inplace=True, name=None):
+    """
+    Gives a new shape to Tensor without changing its data.
+    This layer takes a tensor as input and the attribute shape specifying the
+    new shape. The shape attribute must be specified. At most one dimension of
+    the new shape can be -1. In this case, the value is inferred from the size
+    of the tensor and the remaining dimensions. A dimension could also be 0,
+    in which case the actual dimension value is going to be copied from the
+    input tensor.
+
+    Args:
+        input(variable): The input tensor.
+        shape(list): The new shape. At most one dimension of the new shape can
+                     be -1.
+        act (str): The non-linear activation to be applied to output variable.
+        inplace(bool): If this flag is set true, a new output tensor is created
+                       whose data is copied from input x, otherwise the output
+                       shares data with input without copying.
+
+    Returns(variable): The output tensor.
+
+    Examples:
+        .. code-block:: python
+
+        Given a 2-D tensor X with shape [2 x 2], and the new shape: [1, 4].
+        The reshape layer will change tensor X into a 2-D tensor with
+        shape [1 x 4] with its data unchanged.
+
+        Given a 3-D tensor x with shape [2, 3, 4] and the new shape: [3, -1].
+        The reshape layer will change tensor X into a 2-D tensor with shape:
+        [3 x 8] with its data unchanged.
+
+        Given a 3-D tensor x with shape [2, 3, 8] and the new shape:
+        [-1, 0, 2, 2]. The reshape layer will change tensor X into a 4-D tensor
+        with shape [4, 3, 2, 2] with its data unchanged.
+
+    """
+
+    if not (isinstance(shape, list) or isinstance(shape, tuple)):
+        raise ValueError("Input shape must be a python lsit or tuple.")
+
+    helper = LayerHelper("reshape", **locals())
+    reshaped = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="reshape",
+        inputs={"X": x},
+        attrs={"shape": shape,
+               "inplace": inplace},
+        outputs={"Out": reshaped})
+
+    return helper.append_activation(reshaped)
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 0b88b63962..20dd1b4752 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -47,7 +47,6 @@ __activations__ = [
 __all__ = [
     'mean',
     'mul',
-    'reshape',
     'scale',
     'sigmoid_cross_entropy_with_logits',
     'elementwise_add',
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index dc96aed8db..1a54427ab5 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -14,53 +14,88 @@
 
 import unittest
 import numpy as np
-import pdb
 
 from op_test import OpTest
 
-# class TestReshapeOp1(OpTest):
-#     def setUp(self):
-#         ori_shape = (2, 25)
-#         new_shape = [5, 10]
-# 
-#         self.op_type = "reshape"
-#         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-#         self.attrs = {"shape": new_shape}
-#         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-# 
-#     def test_check_output(self):
-#         self.check_output()
-# 
-#     def test_check_grad(self):
-#         self.check_grad(["X"], "Out")
-# 
-# 
-# class TestReshapeOpDimInfer1(OpTest):
-#     def setUp(self):
-#         self.op_type = "reshape"
-#         self.inputs = {"X": np.random.random((5, 10)).astype("float32")}
-#         self.attrs = {"shape": [5, -1, 5]}
-#         self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
-# 
-#     def test_check_output(self):
-#         self.check_output()
-# 
-#     def test_check_grad(self):
-#         self.check_grad(["X"], "Out")
-
-
-class TestReshapeOp2(OpTest):
+
+class TestReshapeOp(OpTest):
+    def setUp(self):
+        ori_shape = (2, 25)
+        new_shape = (5, 10)
+
+        self.op_type = "reshape"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpDimInfer1(OpTest):
+    def setUp(self):
+        ori_shape = (5, 10)
+        new_shape = (5, -1, 5)
+
+        self.op_type = "reshape"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpDimInfer2(OpTest):
+    def setUp(self):
+        ori_shape = (2, 2, 6)
+        new_shape = (2, 0, 3, -1)
+        infered_shape = (2, 2, 3, -1)
+
+        self.op_type = "reshape"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpInplace(OpTest):
     def setUp(self):
         ori_shape = (2, 25)
-        new_shape = ([5, 10], )
+        new_shape = (5, 10)
+
+        self.op_type = "reshape"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpDimInferInplace1(OpTest):
+    def setUp(self):
+        ori_shape = (5, 10)
+        new_shape = (5, -1, 5)
 
         self.op_type = "reshape"
-        self.inputs = {
-            "X": np.random.random(ori_shape).astype("float32"),
-            "Shape": np.array(
-                new_shape, dtype="int64")
-        }
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape[0])}
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
         self.check_output()
@@ -69,32 +104,23 @@ class TestReshapeOp2(OpTest):
         self.check_grad(["X"], "Out")
 
 
-# class TestReshapeOpInplace(OpTest):
-#     def setUp(self):
-#         self.op_type = "reshape"
-#         self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-#         self.attrs = {'shape': [10 * 20], 'inplace': True}
-#         self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
-# 
-#     def test_check_output(self):
-#         self.check_output()
-# 
-#     def test_check_grad(self):
-#         self.check_grad(["X"], "Out")
-# 
-# 
-# class TestReshapeOpDimInferInplace(OpTest):
-#     def setUp(self):
-#         self.op_type = "reshape"
-#         self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-#         self.attrs = {'shape': [4, -1, 5], 'inplace': True}
-#         self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
-# 
-#     def test_check_output(self):
-#         self.check_output()
-# 
-#     def test_check_grad(self):
-#         self.check_grad(["X"], "Out")
+class TestReshapeOpDimInferInplace2(OpTest):
+    def setUp(self):
+        ori_shape = (2, 2, 6)
+        new_shape = (2, 0, 3, -1)
+        infered_shape = (2, 2, 3, -1)
+
+        self.op_type = "reshape"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
 
 if __name__ == "__main__":
     unittest.main()

From a6e64242d8f73f1a597f2a6634a98453cd07edf1 Mon Sep 17 00:00:00 2001
From: caoying03 <caoying03@baidu.com>
Date: Mon, 19 Mar 2018 11:08:33 +0800
Subject: [PATCH 04/57] follow comments.

---
 paddle/fluid/operators/reshape_op.cc    | 64 +++++++++++++++++--------
 paddle/fluid/operators/reshape_op.h     | 14 +++++-
 python/paddle/fluid/layers/detection.py |  4 +-
 python/paddle/fluid/layers/nn.py        | 52 ++++++++++++--------
 4 files changed, 91 insertions(+), 43 deletions(-)

diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index c0d08cc690..489742b492 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -44,22 +44,22 @@ class ReshapeOp : public framework::OperatorWithKernel {
       ctx->SetOutputDim("Out", x_dims);
     } else {
       ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
-
-      // FIXME(caoying): When shape of the output tensor is determined during
-      // runtime, LoD information of X will not passed to the output.
-      if (shape[0] == x_dims[0]) {
-        // Only pass LoD when the first dimension of output and Input(X)
-        // are the same.
-        ctx->ShareLoD("X", /*->*/ "Out");
-      }
     }
+
+    // NOTE: Reshape op cannot reshape an input sequence batch into an output
+    // sequence batch that has a different number of time steps.
+    // Here output always shares the LoD information with input. But if
+    // Attr(shape) contains 0 or -1, the actual output shape can only be
+    // determined during runtime. The check for wheather it is a valid output
+    // sequence batch is performed in runtime.
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 
  private:
   bool ValidateShape(const std::vector<int> &shape,
                      const framework::DDim &input_dim,
                      std::vector<int64_t> &output_shape) const {
-    // only one dimension canbe set to -1, whose size will be automatically
+    // only one dimension can be set to -1, whose size will be automatically
     // infered.
     const int64_t unknown_index = -1;
     const auto in_size = framework::product(input_dim);
@@ -82,7 +82,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
     }
     PADDLE_ENFORCE_LE(
         neg_dims_idx.size(), 1,
-        "Only one input dimension of Attr(shape) may be unknown.");
+        "Only one input dimension of Attr(shape) can be unknown.");
 
     output_shape.resize(shape.size(), 0);
     std::transform(shape.begin(), shape.end(), output_shape.begin(),
@@ -113,22 +113,46 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::vector<int>>(
         "shape", "(std::vector<int>) Target shape of reshape operator.");
     AddAttr<bool>("inplace",
-                  "Change the source tensor's shape without copy memory.")
-        .SetDefault(true);
+                  "(default: false) Change the source tensor's shape without "
+                  "memory copy. When Attr(inplace) is set true, the output "
+                  "tensor shares memory with Input(X), otherwise, a new output "
+                  "tensor is created, and its data are copied from Input(x).")
+        .SetDefault(false);
     AddComment(R"DOC(
 Reshape Operator.
 
-Reshape Input(X) into the shape specified by Attr(shape).
+Reshape Input(X) into the shape specified by Attr(shape). The data in Input(X)
+are unchanged.
+
+Examples:
+
+1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [6, 8], the reshape operator will transform Input(X)
+into a 2-D tensor with shape [6, 8] and leaving Input(X)'s data unchanged.
+
+1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will transform
+Input(X) into a 4-D tensor with shape [2, 3, 4, 2] and leaving Input(X)'s data
+unchanged. In this case, one and only dimension of Attr(shape) can be set to -1,
+the value of this dimension is inferred from the total element number of
+Input(X) and remaining dimensions.
+
+1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will transform
+Input(X) into a 4-D tensor with shape [2, 4, 3, 2] and leaving Input(X)'s data
+unchanged. In this case, besides -1, 0 means the actual dimension value is going
+to be copied from the corresponding dimension of Input(X).
 
-An example:
-Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]]
+Note:
 
-and target shape = [1, 4], the reshape operator will transform
-the tensor X into a 2-D tensor: [[1, 2, 3, 4]]
+1. One and only one dimension in Attr(shape) can be set -1. In this case,
+the actual dimension value will be infered from the total element number of
+Input(X) and remaining dimensions.
+1. More than one dimensions in Attr(shape) can be set to 0, which means the real
+dimension value will be copied from Input(X) at runtime. Note that the index of
+0 can not access Rank(X). For example, Input(X) is a 3-D tensor with shape
+[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
 
-One dimension in the target shape can be set -1, representing that its
-size is unknown. In this case, the real dimension will be infered from
-the original shape of Input(X) and other dimensions in the target shape.
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index 9dbc5cec6b..dd8eaf3e4f 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -24,11 +24,21 @@ template <typename DeviceContext, typename T>
 class ReshapeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+    auto* in = ctx.Input<framework::LoDTensor>("X");
 
     auto out_dims =
         ValidateShape(ctx.Attr<std::vector<int>>("shape"), in->dims());
+
+    if (!in->lod().empty()) {
+      PADDLE_ENFORCE_EQ(
+          out_dims[0], in->dims()[0],
+          "Reshape operator cannot reshape an input sequence batch "
+          "into an output sequence batch that has a different "
+          "number of time steps. Please consider using "
+          "sequence_reshape op.");
+    }
+
     bool inplace = ctx.Attr<bool>("inplace");
     if (!inplace) {
       out->mutable_data<T>(ctx.GetPlace());
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3ced35d6ce..ec4afa8067 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -130,9 +130,9 @@ def detection_output(loc,
         code_type='decode_center_size')
 
     old_shape = scores.shape
-    scores = ops.reshape(x=scores, shape=(-1, old_shape[-1]))
+    scores = nn.reshape(x=scores, shape=(-1, old_shape[-1]))
     scores = nn.softmax(input=scores)
-    scores = ops.reshape(x=scores, shape=old_shape)
+    scores = nn.reshape(x=scores, shape=old_shape)
     scores = nn.transpose(scores, perm=[0, 2, 1])
 
     nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 48d244f3f6..85693578e1 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3299,13 +3299,35 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
 
 def reshape(x, shape, act=None, inplace=True, name=None):
     """
-    Gives a new shape to Tensor without changing its data.
-    This layer takes a tensor as input and the attribute shape specifying the
-    new shape. The shape attribute must be specified. At most one dimension of
-    the new shape can be -1. In this case, the value is inferred from the size
-    of the tensor and the remaining dimensions. A dimension could also be 0,
-    in which case the actual dimension value is going to be copied from the
-    input tensor.
+    Gives a new shape to the input Tensor without changing its data.
+
+    This layer takes a tensor and the attribute shape which specifies the
+    new shape as its inputs. The shape attribute must be given. It cannot be
+    empty. One and only one dimension of shape can be -1. More than one
+    dimension of shape can be 0.
+
+    -1 means the value of this dimension is inferred from the total element
+    number of x and remaining dimensions.
+
+    0 means the actual dimension value is going to be copied from the
+    corresponding dimension of x.
+
+    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    specified by Attr(shape) is [6, 8], the reshape operator will transform x
+    into a 2-D tensor with shape [6, 8] and leaving x's data unchanged.
+
+    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will
+    transform x into a 4-D tensor with shape [2, 3, 4, 2] and leaving x's data
+    unchanged. In this case, one and only dimension of Attr(shape) can be set
+    to -1, the value of this dimension is inferred from the total element number
+    of x and remaining dimensions.
+
+    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will
+    transform x into a 4-D tensor with shape [2, 4, 3, 2] and leaving x's data
+    unchanged. In this case, besides -1, 0 means the actual dimension value is
+    going to be copied from the corresponding dimension of x during runtime.
 
     Args:
         input(variable): The input tensor.
@@ -3320,18 +3342,10 @@ def reshape(x, shape, act=None, inplace=True, name=None):
 
     Examples:
         .. code-block:: python
-
-        Given a 2-D tensor X with shape [2 x 2], and the new shape: [1, 4].
-        The reshape layer will change tensor X into a 2-D tensor with
-        shape [1 x 4] with its data unchanged.
-
-        Given a 3-D tensor x with shape [2, 3, 4] and the new shape: [3, -1].
-        The reshape layer will change tensor X into a 2-D tensor with shape:
-        [3 x 8] with its data unchanged.
-
-        Given a 3-D tensor x with shape [2, 3, 8] and the new shape:
-        [-1, 0, 2, 2]. The reshape layer will change tensor X into a 4-D tensor
-        with shape [4, 3, 2, 2] with its data unchanged.
+            data = fluid.layers.data(name='data', shape=[2, 4, 6], dtype='float32')
+            reshaped = fluid.layers.reshape(
+                x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True
+            )
 
     """
 

From eb12cbe764a5e80cc8136fe6b96f6783f77ae474 Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Wed, 21 Mar 2018 18:13:00 +0800
Subject: [PATCH 05/57] Refine reshape_op infershape

---
 paddle/fluid/operators/reshape_op.cc |  89 +-------------------
 paddle/fluid/operators/reshape_op.h  | 119 +++++++++++++++++++--------
 2 files changed, 84 insertions(+), 124 deletions(-)

diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 489742b492..ed153e7722 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -17,93 +17,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class ReshapeOp : public framework::OperatorWithKernel {
- public:
-  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
-            const framework::VariableNameMap &outputs,
-            const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ReshapeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ReshapeOp should not be null.");
-
-    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    PADDLE_ENFORCE(!shape.empty(),
-                   "The shape information must be set by Attr(shape).");
-
-    std::vector<int64_t> output_shape;
-    auto x_dims = ctx->GetInputDim("X");
-    bool need_copy_dim = ValidateShape(shape, x_dims, output_shape);
-
-    if (need_copy_dim) {
-      // Some dimensions can only be determined during runtime. Here temporarily
-      // set output tensor's shape the same as that of the input tensor.
-      ctx->SetOutputDim("Out", x_dims);
-    } else {
-      ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
-    }
-
-    // NOTE: Reshape op cannot reshape an input sequence batch into an output
-    // sequence batch that has a different number of time steps.
-    // Here output always shares the LoD information with input. But if
-    // Attr(shape) contains 0 or -1, the actual output shape can only be
-    // determined during runtime. The check for wheather it is a valid output
-    // sequence batch is performed in runtime.
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- private:
-  bool ValidateShape(const std::vector<int> &shape,
-                     const framework::DDim &input_dim,
-                     std::vector<int64_t> &output_shape) const {
-    // only one dimension can be set to -1, whose size will be automatically
-    // infered.
-    const int64_t unknown_index = -1;
-    const auto in_size = framework::product(input_dim);
-    const auto x_rank = input_dim.size();
-
-    bool need_dim_copy = false;
-    std::vector<size_t> neg_dims_idx;
-    for (size_t i = 0; i < shape.size(); ++i) {
-      PADDLE_ENFORCE(shape[i] >= 0 || shape[i] == unknown_index,
-                     "Each input dimension of Attr(shape) must be positive, or "
-                     "only one input dimension can be -1.");
-      if (shape[i] == unknown_index) {
-        neg_dims_idx.push_back(i);
-      } else if (shape[i] == 0) {
-        PADDLE_ENFORCE_LT(
-            i, x_rank,
-            "Only dimension less than rank of Input(X) can be set to 0.");
-        need_dim_copy = true;
-      }
-    }
-    PADDLE_ENFORCE_LE(
-        neg_dims_idx.size(), 1,
-        "Only one input dimension of Attr(shape) can be unknown.");
-
-    output_shape.resize(shape.size(), 0);
-    std::transform(shape.begin(), shape.end(), output_shape.begin(),
-                   [](int a) { return static_cast<int64_t>(a); });
-
-    // some dimension can only be determinted during runtime.
-    if (need_dim_copy) return need_dim_copy;
-
-    int64_t inferred_dim = 0;
-    if (neg_dims_idx.size()) {
-      int64_t capacity = std::accumulate(shape.begin(), shape.end(), 1,
-                                         std::multiplies<int>());
-      inferred_dim = in_size / (-capacity);
-      PADDLE_ENFORCE_EQ(inferred_dim * (-capacity), in_size,
-                        "Invalid shape is given.");
-      output_shape[neg_dims_idx[0]] = inferred_dim;
-    }
-    return false;
-  }
-};
-
 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
@@ -150,7 +63,7 @@ the actual dimension value will be infered from the total element number of
 Input(X) and remaining dimensions.
 1. More than one dimensions in Attr(shape) can be set to 0, which means the real
 dimension value will be copied from Input(X) at runtime. Note that the index of
-0 can not access Rank(X). For example, Input(X) is a 3-D tensor with shape
+0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape
 [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
 
 )DOC");
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index dd8eaf3e4f..db632577d7 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -20,15 +20,90 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+class ReshapeOp : public framework::OperatorWithKernel {
+ public:
+  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
+            const framework::VariableNameMap &outputs,
+            const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReshapeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReshapeOp should not be null.");
+
+    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    PADDLE_ENFORCE(!shape.empty(),
+                   "The shape information must be set by Attr(shape).");
+
+    std::vector<int64_t> output_shape;
+    auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = ValidateShape(shape, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    // NOTE: Reshape op cannot reshape an input sequence batch into an
+    // output sequence batch that has a different number of time steps. Here
+    // output always shares the LoD information with input. But if
+    // Attr(shape) contains 0 or -1, the actual output shape can only be
+    // determined during runtime. The check for wheather it is a valid
+    // output sequence batch is performed in runtime.
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+  static framework::DDim ValidateShape(const std::vector<int> shape,
+                                       const framework::DDim &in_dims) {
+    const int64_t in_size = framework::product(in_dims);
+    // only one dimension canbe set to -1, whose size will be automatically
+    // infered.
+    const int64_t unk_dim_val = -1;
+    const int64_t copy_dim_val = 0;
+
+    std::vector<int64_t> output_shape(shape.size(), 0);
+    int64_t capacity = 1;
+    int unk_dim_idx = -1;
+    for (size_t i = 0; i < shape.size(); ++i) {
+      if (shape[i] == unk_dim_val) {
+        PADDLE_ENFORCE(
+            unk_dim_idx == -1,
+            "Only one input dimension of Attr(shape) can be unknown.");
+        unk_dim_idx = i;
+      } else if (shape[i] == copy_dim_val) {
+        PADDLE_ENFORCE(
+            static_cast<int>(i) < in_dims.size(),
+            "The index of dimension to copy from input shape must be less "
+            "than the size of input shape.");
+      } else {
+        PADDLE_ENFORCE(
+            shape[i] > 0,
+            "Each input dimension of Attr(shape) must not be negtive except "
+            "one unknown dimension.");
+      }
+
+      capacity *= (shape[i] ? shape[i] : in_dims[i]);
+      output_shape[i] =
+          (shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
+    }
+
+    if (unk_dim_idx != -1) {
+      output_shape[unk_dim_idx] = -in_size / capacity;
+      PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
+                        "Invalid shape is given.");
+    } else {
+      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
+    }
+    return framework::make_ddim(output_shape);
+  }
+};
+
 template <typename DeviceContext, typename T>
 class ReshapeKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
-    auto* in = ctx.Input<framework::LoDTensor>("X");
+  void Compute(const framework::ExecutionContext &ctx) const {
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    auto *in = ctx.Input<framework::LoDTensor>("X");
 
-    auto out_dims =
-        ValidateShape(ctx.Attr<std::vector<int>>("shape"), in->dims());
+    auto out_dims = ReshapeOp::ValidateShape(
+        ctx.Attr<std::vector<int>>("shape"), in->dims());
 
     if (!in->lod().empty()) {
       PADDLE_ENFORCE_EQ(
@@ -49,42 +124,14 @@ class ReshapeKernel : public framework::OpKernel<T> {
       out->Resize(out_dims);
     }
   }
-
- private:
-  framework::DDim ValidateShape(const std::vector<int> shape_attr,
-                                const framework::DDim& in_dims) const {
-    const int64_t in_size = framework::product(in_dims);
-    // only one dimension canbe set to -1, whose size will be automatically
-    // infered.
-    const int64_t unknown_index = -1;
-
-    std::vector<int64_t> output_shape(shape_attr.size(), 0);
-    int64_t capacity = 1;
-    int neg_dim_idx = -1;
-    for (size_t i = 0; i < shape_attr.size(); ++i) {
-      if (shape_attr[i] == unknown_index) neg_dim_idx = i;
-      capacity *= (shape_attr[i] ? shape_attr[i] : in_dims[i]);
-      output_shape[i] =
-          (shape_attr[i] ? static_cast<int64_t>(shape_attr[i]) : in_dims[i]);
-    }
-
-    if (neg_dim_idx != -1) {
-      output_shape[neg_dim_idx] = -in_size / capacity;
-      PADDLE_ENFORCE_EQ(output_shape[neg_dim_idx] * capacity, -in_size,
-                        "Invalid shape is given.");
-    } else {
-      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
-    }
-    return framework::make_ddim(output_shape);
-  }
 };
 
 template <typename DeviceContext, typename T>
 class ReshapeGradKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+  void Compute(const framework::ExecutionContext &ctx) const {
+    auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
 
     d_x->mutable_data<T>(ctx.GetPlace());
     bool inplace = ctx.Attr<bool>("inplace");

From 454b0a96be7ff319a9ed05f45f23c513e70eb19f Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Wed, 21 Mar 2018 18:39:58 +0800
Subject: [PATCH 06/57] Remove the extra call of ValidateShape in ReshapeKernel

---
 paddle/fluid/operators/reshape_op.cc | 76 +++++++++++++++++++++++++++
 paddle/fluid/operators/reshape_op.h  | 78 +---------------------------
 2 files changed, 77 insertions(+), 77 deletions(-)

diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index ed153e7722..c817b35693 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -17,6 +17,82 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+class ReshapeOp : public framework::OperatorWithKernel {
+ public:
+  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
+            const framework::VariableNameMap &outputs,
+            const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReshapeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReshapeOp should not be null.");
+
+    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    PADDLE_ENFORCE(!shape.empty(),
+                   "The shape information must be set by Attr(shape).");
+
+    std::vector<int64_t> output_shape;
+    auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = ValidateShape(shape, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    // NOTE: Reshape op cannot reshape an input sequence batch into an
+    // output sequence batch that has a different number of time steps. Here
+    // output always shares the LoD information with input. But if
+    // Attr(shape) contains 0 or -1, the actual output shape can only be
+    // determined during runtime. The check for wheather it is a valid
+    // output sequence batch is performed in runtime.
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ private:
+  framework::DDim ValidateShape(const std::vector<int> shape,
+                                const framework::DDim &in_dims) const {
+    const int64_t in_size = framework::product(in_dims);
+    // only one dimension canbe set to -1, whose size will be automatically
+    // infered.
+    const int64_t unk_dim_val = -1;
+    const int64_t copy_dim_val = 0;
+
+    std::vector<int64_t> output_shape(shape.size(), 0);
+    int64_t capacity = 1;
+    int unk_dim_idx = -1;
+    for (size_t i = 0; i < shape.size(); ++i) {
+      if (shape[i] == unk_dim_val) {
+        PADDLE_ENFORCE(
+            unk_dim_idx == -1,
+            "Only one input dimension of Attr(shape) can be unknown.");
+        unk_dim_idx = i;
+      } else if (shape[i] == copy_dim_val) {
+        PADDLE_ENFORCE(
+            static_cast<int>(i) < in_dims.size(),
+            "The index of dimension to copy from input shape must be less "
+            "than the size of input shape.");
+      } else {
+        PADDLE_ENFORCE(
+            shape[i] > 0,
+            "Each input dimension of Attr(shape) must not be negtive except "
+            "one unknown dimension.");
+      }
+
+      capacity *= (shape[i] ? shape[i] : in_dims[i]);
+      output_shape[i] =
+          (shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
+    }
+
+    if (unk_dim_idx != -1) {
+      output_shape[unk_dim_idx] = -in_size / capacity;
+      PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
+                        "Invalid shape is given.");
+    } else {
+      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
+    }
+    return framework::make_ddim(output_shape);
+  }
+};
+
 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index db632577d7..59adb5e87c 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -20,81 +20,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class ReshapeOp : public framework::OperatorWithKernel {
- public:
-  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
-            const framework::VariableNameMap &outputs,
-            const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ReshapeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ReshapeOp should not be null.");
-
-    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    PADDLE_ENFORCE(!shape.empty(),
-                   "The shape information must be set by Attr(shape).");
-
-    std::vector<int64_t> output_shape;
-    auto x_dims = ctx->GetInputDim("X");
-    auto out_dims = ValidateShape(shape, x_dims);
-    ctx->SetOutputDim("Out", out_dims);
-    // NOTE: Reshape op cannot reshape an input sequence batch into an
-    // output sequence batch that has a different number of time steps. Here
-    // output always shares the LoD information with input. But if
-    // Attr(shape) contains 0 or -1, the actual output shape can only be
-    // determined during runtime. The check for wheather it is a valid
-    // output sequence batch is performed in runtime.
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
-  static framework::DDim ValidateShape(const std::vector<int> shape,
-                                       const framework::DDim &in_dims) {
-    const int64_t in_size = framework::product(in_dims);
-    // only one dimension canbe set to -1, whose size will be automatically
-    // infered.
-    const int64_t unk_dim_val = -1;
-    const int64_t copy_dim_val = 0;
-
-    std::vector<int64_t> output_shape(shape.size(), 0);
-    int64_t capacity = 1;
-    int unk_dim_idx = -1;
-    for (size_t i = 0; i < shape.size(); ++i) {
-      if (shape[i] == unk_dim_val) {
-        PADDLE_ENFORCE(
-            unk_dim_idx == -1,
-            "Only one input dimension of Attr(shape) can be unknown.");
-        unk_dim_idx = i;
-      } else if (shape[i] == copy_dim_val) {
-        PADDLE_ENFORCE(
-            static_cast<int>(i) < in_dims.size(),
-            "The index of dimension to copy from input shape must be less "
-            "than the size of input shape.");
-      } else {
-        PADDLE_ENFORCE(
-            shape[i] > 0,
-            "Each input dimension of Attr(shape) must not be negtive except "
-            "one unknown dimension.");
-      }
-
-      capacity *= (shape[i] ? shape[i] : in_dims[i]);
-      output_shape[i] =
-          (shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
-    }
-
-    if (unk_dim_idx != -1) {
-      output_shape[unk_dim_idx] = -in_size / capacity;
-      PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
-                        "Invalid shape is given.");
-    } else {
-      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
-    }
-    return framework::make_ddim(output_shape);
-  }
-};
-
 template <typename DeviceContext, typename T>
 class ReshapeKernel : public framework::OpKernel<T> {
  public:
@@ -102,8 +27,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
     auto *out = ctx.Output<framework::LoDTensor>("Out");
     auto *in = ctx.Input<framework::LoDTensor>("X");
 
-    auto out_dims = ReshapeOp::ValidateShape(
-        ctx.Attr<std::vector<int>>("shape"), in->dims());
+    auto out_dims = out->dims();
 
     if (!in->lod().empty()) {
       PADDLE_ENFORCE_EQ(

From d4bb2ca71f72e31b78231e1bc0907330392ef759 Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Thu, 22 Mar 2018 13:36:58 +0800
Subject: [PATCH 07/57] Follow comments and refine the python wrapper of
 reshape_op

---
 python/paddle/fluid/layers/nn.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index b4e3e83e3a..d98e1bdfca 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3361,7 +3361,9 @@ def reshape(x, shape, act=None, inplace=True, name=None):
 
     Examples:
         .. code-block:: python
-            data = fluid.layers.data(name='data', shape=[2, 4, 6], dtype='float32')
+            data = fluid.layers.data(
+                name='data', shape=[2, 4, 6], dtype='float32'
+            )
             reshaped = fluid.layers.reshape(
                 x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True
             )
@@ -3371,6 +3373,21 @@ def reshape(x, shape, act=None, inplace=True, name=None):
     if not (isinstance(shape, list) or isinstance(shape, tuple)):
         raise ValueError("Input shape must be a python lsit or tuple.")
 
+    # Validate the shape
+    unk_dim_idx = -1
+    for dim_idx, dim_size in enumerate(shape):
+        if dim_size == -1:
+            assert unk_dim_idx == -1, (
+                "Only one dimension in shape can be unknown.")
+            unk_dim_idx = dim_idx
+        elif dim_size == 0:
+            assert dim_idx < len(x.shape), (
+                "The indice of 0s in shape can not exceed Rank(X).")
+        else:
+            assert dim_size > 0, (
+                "Each dimension size given in shape must not be negtive "
+                "except one unknown dimension.")
+
     helper = LayerHelper("reshape", **locals())
     reshaped = helper.create_tmp_variable(dtype=x.dtype)
     helper.append_op(

From c078ed4608c9dd4b43a73f21c6030097aeb1ae1c Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Wed, 28 Mar 2018 02:57:54 +0800
Subject: [PATCH 08/57] Enhance reshape_op by adding Input(Shape)

---
 paddle/fluid/operators/reshape_op.cc          | 101 ++++-------------
 paddle/fluid/operators/reshape_op.h           | 106 +++++++++++++++++-
 python/paddle/fluid/layers/nn.py              |  63 ++++++-----
 .../fluid/tests/unittests/test_reshape_op.py  |  22 ++++
 4 files changed, 184 insertions(+), 108 deletions(-)

diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index c817b35693..4b1aaf5849 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -17,88 +17,18 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class ReshapeOp : public framework::OperatorWithKernel {
- public:
-  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
-            const framework::VariableNameMap &outputs,
-            const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ReshapeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ReshapeOp should not be null.");
-
-    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    PADDLE_ENFORCE(!shape.empty(),
-                   "The shape information must be set by Attr(shape).");
-
-    std::vector<int64_t> output_shape;
-    auto x_dims = ctx->GetInputDim("X");
-    auto out_dims = ValidateShape(shape, x_dims);
-    ctx->SetOutputDim("Out", out_dims);
-    // NOTE: Reshape op cannot reshape an input sequence batch into an
-    // output sequence batch that has a different number of time steps. Here
-    // output always shares the LoD information with input. But if
-    // Attr(shape) contains 0 or -1, the actual output shape can only be
-    // determined during runtime. The check for wheather it is a valid
-    // output sequence batch is performed in runtime.
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- private:
-  framework::DDim ValidateShape(const std::vector<int> shape,
-                                const framework::DDim &in_dims) const {
-    const int64_t in_size = framework::product(in_dims);
-    // only one dimension canbe set to -1, whose size will be automatically
-    // infered.
-    const int64_t unk_dim_val = -1;
-    const int64_t copy_dim_val = 0;
-
-    std::vector<int64_t> output_shape(shape.size(), 0);
-    int64_t capacity = 1;
-    int unk_dim_idx = -1;
-    for (size_t i = 0; i < shape.size(); ++i) {
-      if (shape[i] == unk_dim_val) {
-        PADDLE_ENFORCE(
-            unk_dim_idx == -1,
-            "Only one input dimension of Attr(shape) can be unknown.");
-        unk_dim_idx = i;
-      } else if (shape[i] == copy_dim_val) {
-        PADDLE_ENFORCE(
-            static_cast<int>(i) < in_dims.size(),
-            "The index of dimension to copy from input shape must be less "
-            "than the size of input shape.");
-      } else {
-        PADDLE_ENFORCE(
-            shape[i] > 0,
-            "Each input dimension of Attr(shape) must not be negtive except "
-            "one unknown dimension.");
-      }
-
-      capacity *= (shape[i] ? shape[i] : in_dims[i]);
-      output_shape[i] =
-          (shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
-    }
-
-    if (unk_dim_idx != -1) {
-      output_shape[unk_dim_idx] = -in_size / capacity;
-      PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
-                        "Invalid shape is given.");
-    } else {
-      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
-    }
-    return framework::make_ddim(output_shape);
-  }
-};
-
 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input tensor of reshape operator.");
-    AddOutput("Out", "The output tensor of reshape operator.");
+    AddInput("X", "(Tensor). The input tensor of reshape operator.");
+    AddInput("Shape",
+             "(Tensor<int32>, optional). If provided, reshape according to "
+             "this given shape. That is to say it has a higher priority than "
+             "the shape attribute, while the shape attribute still should be "
+             "set correctly to gurantee shape inference in compile time.")
+        .AsDispensable();
+    AddOutput("Out", "(Tensor). The output tensor of reshape operator.");
     AddAttr<std::vector<int>>(
         "shape", "(std::vector<int>) Target shape of reshape operator.");
     AddAttr<bool>("inplace",
@@ -110,8 +40,8 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Reshape Operator.
 
-Reshape Input(X) into the shape specified by Attr(shape). The data in Input(X)
-are unchanged.
+Reshape Input(X) into the shape specified by Attr(shape) or Input(Shape). The
+data in Input(X) are unchanged.
 
 Examples:
 
@@ -141,6 +71,9 @@ Input(X) and remaining dimensions.
 dimension value will be copied from Input(X) at runtime. Note that the index of
 0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape
 [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
+1. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
+Attr(shape) still should be set correctly to gurantee shape inference in 
+compile-time.
 
 )DOC");
   }
@@ -160,6 +93,14 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
                    "Input(Out@GRAD) shouldn't be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index 59adb5e87c..3a9a769229 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -20,15 +20,115 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+class ReshapeOp : public framework::OperatorWithKernel {
+ public:
+  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
+            const framework::VariableNameMap &outputs,
+            const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReshapeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReshapeOp should not be null.");
+
+    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    PADDLE_ENFORCE(!shape.empty(),
+                   "The shape information must be set by Attr(shape).");
+
+    if (ctx->HasInput("Shape") && ctx->IsRuntime()) {
+      // If true, set the shape of Output(Out) according to Input(Shape) in
+      // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel.
+      ctx->ShareLoD("X", /*->*/ "Out");
+      return;
+    }
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = ValidateShape(shape, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+
+  static framework::DDim ValidateShape(const std::vector<int> shape,
+                                       const framework::DDim &in_dims) {
+    const int64_t in_size = framework::product(in_dims);
+    // only one dimension canbe set to -1, whose size will be automatically
+    // infered.
+    const int64_t unk_dim_val = -1;
+    const int64_t copy_dim_val = 0;
+
+    std::vector<int64_t> output_shape(shape.size(), 0);
+    int64_t capacity = 1;
+    int unk_dim_idx = -1;
+    for (size_t i = 0; i < shape.size(); ++i) {
+      // std::cout<< shape[i] << "haha";
+      if (shape[i] == unk_dim_val) {
+        PADDLE_ENFORCE(
+            unk_dim_idx == -1,
+            "Only one input dimension of Attr(shape) can be unknown.");
+        unk_dim_idx = i;
+      } else if (shape[i] == copy_dim_val) {
+        PADDLE_ENFORCE(
+            static_cast<int>(i) < in_dims.size(),
+            "The index of dimension to copy from input shape must be less "
+            "than the size of input shape.");
+      } else {
+        PADDLE_ENFORCE(
+            shape[i] > 0,
+            "Each input dimension of Attr(shape) must not be negtive except "
+            "one unknown dimension.");
+      }
+
+      capacity *= (shape[i] ? shape[i] : in_dims[i]);
+      output_shape[i] =
+          (shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
+    }
+
+    if (unk_dim_idx != -1) {
+      output_shape[unk_dim_idx] = -in_size / capacity;
+      PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
+                        "Invalid shape is given.");
+    } else {
+      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
+    }
+    return framework::make_ddim(output_shape);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
 template <typename DeviceContext, typename T>
 class ReshapeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const {
     auto *out = ctx.Output<framework::LoDTensor>("Out");
     auto *in = ctx.Input<framework::LoDTensor>("X");
+    auto *shape_tensor = ctx.Input<framework::LoDTensor>("Shape");
 
-    auto out_dims = out->dims();
-
+    framework::DDim out_dims = out->dims();
+    if (shape_tensor) {
+      auto *shape_data = shape_tensor->data<int>();
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        framework::Tensor cpu_shape_tensor;
+        TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(),
+                   &cpu_shape_tensor);
+        shape_data = cpu_shape_tensor.data<int>();
+      }
+      auto shape =
+          std::vector<int>(shape_data, shape_data + shape_tensor->numel());
+      out_dims = ReshapeOp::ValidateShape(shape, in->dims());
+    }
     if (!in->lod().empty()) {
       PADDLE_ENFORCE_EQ(
           out_dims[0], in->dims()[0],
@@ -39,9 +139,11 @@ class ReshapeKernel : public framework::OpKernel<T> {
     }
 
     bool inplace = ctx.Attr<bool>("inplace");
+    out->Resize(out_dims);
     if (!inplace) {
       out->mutable_data<T>(ctx.GetPlace());
       framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
+      // TensorCopy will resize to in_dims.
       out->Resize(out_dims);
     } else {
       out->ShareDataWith(*in);
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0e8354a4a0..098a629c89 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3320,42 +3320,54 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
     return counter
 
 
-def reshape(x, shape, act=None, inplace=True, name=None):
+def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
     """
     Gives a new shape to the input Tensor without changing its data.
 
-    This layer takes a tensor and the attribute shape which specifies the
-    new shape as its inputs. The shape attribute must be given. It cannot be
-    empty. One and only one dimension of shape can be -1. More than one
-    dimension of shape can be 0.
+    The target shape can be given by :attr:`shape` or :attr:`actual_shape`.
+    :attr:`shape` is a list of integer while :attr:`actual_shape` is a tensor
+    variable. :attr:`actual_shape` has a higher priority than :attr:`shape`
+    if it is provided, while :attr:`shape` still should be set correctly to
+    gurantee shape inference in compile-time.
 
-    -1 means the value of this dimension is inferred from the total element
-    number of x and remaining dimensions.
+    Some tricks exist when specifying the target shape.
 
-    0 means the actual dimension value is going to be copied from the
-    corresponding dimension of x.
+    1. -1 means the value of this dimension is inferred from the total element
+    number of x and remaining dimensions. Thus one and only one dimension can
+    be set -1.
+
+    1. 0 means the actual dimension value is going to be copied from the
+    corresponding dimension of x. The indice of 0s in shape can not exceed
+    Rank(X).
+
+    Here are some examples to explain it.
 
     1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
-    specified by Attr(shape) is [6, 8], the reshape operator will transform x
-    into a 2-D tensor with shape [6, 8] and leaving x's data unchanged.
+    is [6, 8], the reshape operator will transform x into a 2-D tensor with 
+    shape [6, 8] and leaving x's data unchanged.
 
     1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
-    specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will
-    transform x into a 4-D tensor with shape [2, 3, 4, 2] and leaving x's data
-    unchanged. In this case, one and only dimension of Attr(shape) can be set
-    to -1, the value of this dimension is inferred from the total element number
-    of x and remaining dimensions.
+    specified is [2, 3, -1, 2], the reshape operator will transform x into a
+    4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
+    case, one dimension of the target shape is set to -1, the value of this 
+    dimension is inferred from the total element number of x and remaining 
+    dimensions.
 
     1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
-    specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will
-    transform x into a 4-D tensor with shape [2, 4, 3, 2] and leaving x's data
-    unchanged. In this case, besides -1, 0 means the actual dimension value is
-    going to be copied from the corresponding dimension of x during runtime.
+    is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor
+    with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case,
+    besides -1, 0 means the actual dimension value is going to be copied from
+    the corresponding dimension of x.
 
     Args:
         input(variable): The input tensor.
         shape(list): The new shape. At most one dimension of the new shape can
                      be -1.
+        actual_shape(variable): An optional input. If provided, reshape
+                                according to this given shape rather than
+                                :attr:`shape` specifying shape. That is to
+                                say :attr:`actual_shape` has a higher priority
+                                than :attr:`shape`.
         act (str): The non-linear activation to be applied to output variable.
         inplace(bool): If this flag is set true, a new output tensor is created
                        whose data is copied from input x, otherwise the output
@@ -3366,12 +3378,9 @@ def reshape(x, shape, act=None, inplace=True, name=None):
     Examples:
         .. code-block:: python
             data = fluid.layers.data(
-                name='data', shape=[2, 4, 6], dtype='float32'
-            )
+                name='data', shape=[2, 4, 6], dtype='float32')
             reshaped = fluid.layers.reshape(
-                x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True
-            )
-
+                x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True)
     """
 
     if not (isinstance(shape, list) or isinstance(shape, tuple)):
@@ -3396,7 +3405,9 @@ def reshape(x, shape, act=None, inplace=True, name=None):
     reshaped = helper.create_tmp_variable(dtype=x.dtype)
     helper.append_op(
         type="reshape",
-        inputs={"X": x},
+        inputs={"X": x,
+                "Shape": actual_shape}
+        if isinstance(actual_shape, Variable) else {"X": x},
         attrs={"shape": shape,
                "inplace": inplace},
         outputs={"Out": reshaped})
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 1a54427ab5..88c9933da3 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -122,5 +122,27 @@ class TestReshapeOpDimInferInplace2(OpTest):
         self.check_grad(["X"], "Out")
 
 
+class TestReshapeOpWithInputShape(OpTest):
+    def setUp(self):
+        ori_shape = (6, 5)
+        new_shape = (0, -1, 5)
+        actual_shape = (2, 3, 5)
+
+        self.op_type = "reshape"
+        self.inputs = {
+            "X": np.random.random(ori_shape).astype("float32"),
+            "Shape": np.array(
+                actual_shape, dtype="int32")
+        }
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    # def test_check_grad(self):
+    #     self.check_grad(["X"], "Out")
+
+
 if __name__ == "__main__":
     unittest.main()

From 09743b61170718c7de8681cef813e93d816e53af Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Wed, 28 Mar 2018 13:36:59 +0800
Subject: [PATCH 09/57] Refine test_reshape_op

---
 python/paddle/fluid/tests/unittests/test_reshape_op.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 88c9933da3..f51b5a7e99 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -140,8 +140,8 @@ class TestReshapeOpWithInputShape(OpTest):
     def test_check_output(self):
         self.check_output()
 
-    # def test_check_grad(self):
-    #     self.check_grad(["X"], "Out")
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
 
 
 if __name__ == "__main__":

From 5b8bb3447006acabbc663dd9eb960560d78adca0 Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Thu, 29 Mar 2018 16:24:39 +0800
Subject: [PATCH 10/57] Refine reshape_op by following comments.

---
 paddle/fluid/operators/reshape_op.cc | 10 ++++++----
 paddle/fluid/operators/reshape_op.h  |  1 -
 python/paddle/fluid/layers/nn.py     |  6 +++---
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 4b1aaf5849..b87b8e6b26 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -49,14 +49,14 @@ Examples:
 specified by Attr(shape) is [6, 8], the reshape operator will transform Input(X)
 into a 2-D tensor with shape [6, 8] and leaving Input(X)'s data unchanged.
 
-1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+2. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
 specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will transform
 Input(X) into a 4-D tensor with shape [2, 3, 4, 2] and leaving Input(X)'s data
 unchanged. In this case, one and only dimension of Attr(shape) can be set to -1,
 the value of this dimension is inferred from the total element number of
 Input(X) and remaining dimensions.
 
-1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+3. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
 specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will transform
 Input(X) into a 4-D tensor with shape [2, 4, 3, 2] and leaving Input(X)'s data
 unchanged. In this case, besides -1, 0 means the actual dimension value is going
@@ -67,11 +67,13 @@ Note:
 1. One and only one dimension in Attr(shape) can be set -1. In this case,
 the actual dimension value will be infered from the total element number of
 Input(X) and remaining dimensions.
-1. More than one dimensions in Attr(shape) can be set to 0, which means the real
+
+2. More than one dimensions in Attr(shape) can be set to 0, which means the real
 dimension value will be copied from Input(X) at runtime. Note that the index of
 0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape
 [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
-1. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
+
+3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
 Attr(shape) still should be set correctly to gurantee shape inference in 
 compile-time.
 
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index 3a9a769229..871b4d38d5 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -66,7 +66,6 @@ class ReshapeOp : public framework::OperatorWithKernel {
     int64_t capacity = 1;
     int unk_dim_idx = -1;
     for (size_t i = 0; i < shape.size(); ++i) {
-      // std::cout<< shape[i] << "haha";
       if (shape[i] == unk_dim_val) {
         PADDLE_ENFORCE(
             unk_dim_idx == -1,
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index c2d32954b5..ed82fa8940 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3337,7 +3337,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
     number of x and remaining dimensions. Thus one and only one dimension can
     be set -1.
 
-    1. 0 means the actual dimension value is going to be copied from the
+    2. 0 means the actual dimension value is going to be copied from the
     corresponding dimension of x. The indice of 0s in shape can not exceed
     Rank(X).
 
@@ -3347,14 +3347,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
     is [6, 8], the reshape operator will transform x into a 2-D tensor with 
     shape [6, 8] and leaving x's data unchanged.
 
-    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
     specified is [2, 3, -1, 2], the reshape operator will transform x into a
     4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
     case, one dimension of the target shape is set to -1, the value of this 
     dimension is inferred from the total element number of x and remaining 
     dimensions.
 
-    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
     is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor
     with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case,
     besides -1, 0 means the actual dimension value is going to be copied from

From 53fa7cb9ccd17ce2e7ce0245a4733fbe73bef725 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Fri, 30 Mar 2018 17:38:02 +0800
Subject: [PATCH 11/57] Add local cache of double buffer reader

---
 .../reader/create_double_buffer_reader_op.cc  | 22 ++++++++++++-------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 141a3eb935..f4b10cb032 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -128,9 +128,6 @@ void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
     PADDLE_THROW("There is no next data!");
   }
 
-  if (local_buffer_.payloads_.empty()) {
-    buffer_->Receive(&local_buffer_);
-  }
   *out = local_buffer_.payloads_;
   local_buffer_.payloads_.clear();
   if (local_buffer_.ctx_) {
@@ -149,21 +146,30 @@ void DoubleBufferReader::ReInit() {
 void DoubleBufferReader::PrefetchThreadFunc() {
   VLOG(5) << "A new prefetch thread starts.";
   size_t gpu_ctx_offset = 0;
+  std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(4);
+  std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(4);
+  size_t tensor_cache_id = 0;
+
   while (reader_->HasNext()) {
     Item batch;
     reader_->ReadNext(&batch.payloads_);
     if (platform::is_gpu_place(place_)) {
-      std::vector<framework::LoDTensor> gpu_batch;
+      tensor_cache_id %= 4;
+      auto& gpu_batch = gpu_tensor_cache[tensor_cache_id];
+      auto& cpu_batch = cpu_tensor_cache[tensor_cache_id];
+      cpu_batch = batch.payloads_;
+      ++tensor_cache_id;
+
       auto& gpu_ctx = this->ctxs_[gpu_ctx_offset++];
       gpu_ctx_offset %= this->ctxs_.size();
+
       gpu_batch.resize(batch.payloads_.size());
-      for (size_t i = 0; i < batch.payloads_.size(); ++i) {
-        framework::TensorCopy(batch.payloads_[i], place_, *gpu_ctx,
-                              &gpu_batch[i]);
+      for (size_t i = 0; i < cpu_batch.size(); ++i) {
+        framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]);
         gpu_batch[i].set_lod(batch.payloads_[i].lod());
       }
       batch.ctx_ = gpu_ctx.get();
-      std::swap(gpu_batch, batch.payloads_);
+      batch.payloads_ = gpu_batch;
     }
 
     try {

From c3580eae4656a2ae66112b2ea372291e4c6d5b4c Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Fri, 30 Mar 2018 17:56:56 +0800
Subject: [PATCH 12/57] Add prefetch interface on server side

---
 paddle/fluid/operators/detail/CMakeLists.txt  |  3 +-
 paddle/fluid/operators/detail/grpc_client.cc  |  3 +-
 paddle/fluid/operators/detail/grpc_server.cc  | 61 ++++++++++++++++++-
 paddle/fluid/operators/detail/grpc_server.h   | 15 +++++
 .../operators/detail/grpc_server_test.cc      | 51 ++++++++++++++++
 paddle/fluid/operators/detail/grpc_service.h  |  3 +
 paddle/fluid/operators/detail/send_recv.proto |  2 +
 paddle/fluid/platform/profiler_test.cc        |  4 ++
 8 files changed, 139 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/operators/detail/grpc_server_test.cc

diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
index 2b19f04489..997309325c 100644
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@@ -2,7 +2,8 @@ if(WITH_DISTRIBUTE)
   grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
       grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(test_serde.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   cc_test(serde_test SRCS test_serde.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
       cares zlib protobuf sendrecvop_grpc)
+  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
 endif()
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index 9652bb888b..ba9882ce24 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -150,7 +150,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
     s->response_call_back_ = ProcGetResponse;
 
     auto call = s->stub_g_.PrepareUnaryCall(
-        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", req, &cq_);
+        s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
+        &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, (void*)s);
   });
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 9691d1e86b..26bef375cb 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -128,6 +128,47 @@ class RequestGet final : public RequestBase {
   SimpleBlockQueue<MessageWithName>* queue_;
 };
 
+class RequestPrefetch final : public RequestBase {
+ public:
+  explicit RequestPrefetch(GrpcService::AsyncService* service,
+                           ::grpc::ServerCompletionQueue* cq,
+                           framework::Scope* scope,
+                           const platform::DeviceContext* dev_ctx,
+                           framework::Executor* executor,
+                           framework::ProgramDesc* program, int blkid)
+      : RequestBase(service, cq, dev_ctx),
+        responder_(&ctx_),
+        scope_(scope),
+        executor_(executor),
+        program_(program),
+        blkid_(blkid) {
+    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
+    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
+                                cq_, this);
+  }
+
+  virtual ~RequestPrefetch() {}
+
+  virtual std::string GetReqName() { return request_.varname(); }
+
+  virtual void Process() {
+    // prefetch process...
+    ::grpc::ByteBuffer relay;
+    // TODO(Yancey1989): execute the Block which containers prefetch ops
+
+    responder_.Finish(relay, ::grpc::Status::OK, this);
+    status_ = FINISH;
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+  framework::Scope* scope_;
+  framework::Executor* executor_;
+  framework::ProgramDesc* program_;
+  int blkid_;
+};
+
 void AsyncGRPCServer::WaitClientGet(int count) {
   int fetch_barriers = 0;
   while (fetch_barriers < count) {
@@ -147,6 +188,7 @@ void AsyncGRPCServer::RunSyncUpdate() {
 
   cq_send_ = builder.AddCompletionQueue();
   cq_get_ = builder.AddCompletionQueue();
+  cq_prefetch_ = builder.AddCompletionQueue();
 
   server_ = builder.BuildAndStart();
   LOG(INFO) << "Server listening on " << address_ << std::endl;
@@ -155,6 +197,8 @@ void AsyncGRPCServer::RunSyncUpdate() {
       std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
   std::function<void()> get_register =
       std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
+  std::function<void()> prefetch_register =
+      std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
 
   t_send_.reset(
       new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
@@ -163,11 +207,14 @@ void AsyncGRPCServer::RunSyncUpdate() {
   t_get_.reset(
       new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
                                 cq_get_.get(), "cq_get", get_register)));
-
+  t_prefetch_.reset(new std::thread(
+      std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
+                "cq_prefetch", prefetch_register)));
   // wait server
   server_->Wait();
   t_send_->join();
   t_get_->join();
+  t_prefetch_->join();
 }
 
 void AsyncGRPCServer::ShutdownQueue() {
@@ -203,6 +250,18 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
   VLOG(4) << "Create RequestGet status:" << get->Status();
 }
 
+void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  if (is_shut_down_) {
+    return;
+  }
+  RequestPrefetch* prefetch =
+      new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_,
+                          executor_, program_, prefetch_blk_id_);
+
+  VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
+}
+
 // FIXME(typhoonzero): change cq_name to enum.
 void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
                                     std::string cq_name,
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
index 10e6dd45a9..dd5cf4b377 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -17,7 +17,9 @@ limitations under the License. */
 #include <grpc++/grpc++.h>
 #include <thread>
 
+#include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
@@ -53,6 +55,12 @@ class AsyncGRPCServer final {
 
   void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; }
 
+  void SetProgram(framework::ProgramDesc *program) { program_ = program; }
+
+  void SetPrefetchBlkdId(int blkid) { prefetch_blk_id_ = blkid; }
+
+  void SetExecutor(framework::Executor *executor) { executor_ = executor; }
+
   const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
 
   void Push(const std::string &msg_name) {
@@ -66,6 +74,7 @@ class AsyncGRPCServer final {
                      std::function<void()> TryToRegisterNewOne);
   void TryToRegisterNewSendOne();
   void TryToRegisterNewGetOne();
+  void TryToRegisterNewPrefetchOne();
   void ShutdownQueue();
 
  private:
@@ -73,6 +82,7 @@ class AsyncGRPCServer final {
   volatile bool is_shut_down_ = false;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_;
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_;
 
   GrpcService::AsyncService service_;
   std::unique_ptr<::grpc::Server> server_;
@@ -92,6 +102,11 @@ class AsyncGRPCServer final {
 
   std::unique_ptr<std::thread> t_send_;
   std::unique_ptr<std::thread> t_get_;
+  std::unique_ptr<std::thread> t_prefetch_;
+
+  int prefetch_blk_id_;
+  framework::ProgramDesc *program_;
+  framework::Executor *executor_;
 };
 
 };  // namespace detail
diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc
new file mode 100644
index 0000000000..5773748106
--- /dev/null
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/grpc_server.h"
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace detail = paddle::operators::detail;
+
+std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;
+
+void StartServer(const std::string& endpoint) {
+  rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+}
+
+TEST(PREFETCH, CPU) {
+  // start up a server instance backend
+  // TODO(Yancey1989): Need to start a server with optimize blocks and
+  // prefetch blocks.
+  std::thread server_thread(StartServer, "127.0.0.1:8889");
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  // create var on local scope
+  std::string var_name("tmp_0");
+  auto var = scope.Var(var_name);
+  auto tensor = var->GetMutable<framework::LoDTensor>();
+  tensor->Resize({10, 10});
+
+  detail::RPCClient client;
+  client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, var_name, "");
+  server_thread.join();
+  rpc_service_.reset(nullptr);
+}
diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h
index ae6f9db3bd..879e21933b 100644
--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
@@ -76,6 +76,7 @@ namespace detail {
 enum class GrpcMethod {
   kSendVariable,
   kGetVariable,
+  kPrefetchVariable,
 };
 
 static const int kGrpcNumMethods =
@@ -87,6 +88,8 @@ inline const char* GrpcMethodName(GrpcMethod id) {
       return "/sendrecv.SendRecvService/SendVariable";
     case GrpcMethod::kGetVariable:
       return "/sendrecv.SendRecvService/GetVariable";
+    case GrpcMethod::kPrefetchVariable:
+      return "/sendrecv.SendREcvService/PrefetchVariable";
   }
 
   // Shouldn't be reached.
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
index 2d33f026e4..fc12e82a7e 100644
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -21,6 +21,8 @@ service SendRecvService {
   rpc SendVariable(VariableMessage) returns (VoidMessage) {}
   // Argument VariableMessage for GetVariable should only contain varname.
   rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  // Prefetch variable by Ids
+  rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
 }
 
 // VariableMessage is serialized paddle variable message.
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
index 366c82bf96..45cc271bb8 100644
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_CUDA
 #include "cuda_runtime.h"
+#endif
 #include "gtest/gtest.h"
 
 TEST(Event, CpuElapsedTime) {
@@ -159,6 +161,7 @@ TEST(RecordEvent, RecordEvent) {
   DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler");
 }
 
+#ifdef PADDLE_WITH_CUDA
 TEST(TMP, stream_wait) {
   cudaStream_t stream;
   cudaStreamCreate(&stream);
@@ -166,3 +169,4 @@ TEST(TMP, stream_wait) {
   cudaStreamSynchronize(stream);
   cudaStreamSynchronize(stream);
 }
+#endif

From 7bb18433fd34a43ac46b0b134284b8d516c6ece0 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Sat, 31 Mar 2018 01:08:32 +0800
Subject: [PATCH 13/57] refine code

---
 .../reader/create_double_buffer_reader_op.cc  | 88 +++++++++----------
 1 file changed, 44 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index f4b10cb032..1b7df87b35 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -20,7 +20,8 @@ namespace paddle {
 namespace operators {
 namespace reader {
 
-static constexpr size_t kDoubleBufferSize = 2;
+static constexpr size_t kChannelSize = 2;
+static constexpr size_t kCacheSize = 4;  // kChannelSize + 2
 
 class DoubleBufferReader : public framework::DecoratedReader {
  public:
@@ -34,33 +35,36 @@ class DoubleBufferReader : public framework::DecoratedReader {
   explicit DoubleBufferReader(
       ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
       : DecoratedReader(reader), place_(target_place) {
-    for (size_t i = 0; i < kDoubleBufferSize; ++i) {
-      if (platform::is_gpu_place(place_)) {
 #ifdef PADDLE_WITH_CUDA
+    for (size_t i = 0; i < kChannelSize + 2; ++i) {
+      if (platform::is_gpu_place(place_)) {
         ctxs_.emplace_back(new platform::CUDADeviceContext(
             boost::get<platform::CUDAPlace>(place_)));
-#endif
       }
     }
-
-    start_thread();
-  }
-
-  void start_thread() {
-    buffer_ = framework::MakeChannel<Item>(kDoubleBufferSize);
-    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
+#endif
+    StartPrefetcher();
   }
 
+  bool HasNext() const override;
   void ReadNext(std::vector<framework::LoDTensor>* out) override;
   void ReInit() override;
 
-  ~DoubleBufferReader() {
+  void StartPrefetcher() {
+    buffer_ = framework::MakeChannel<Item>(kChannelSize);
+    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
+  }
+
+  void EndPrefetcher() {
     buffer_->Close();
-    prefetcher_.join();
+    if (prefecther_.joinable()) {
+      prefetcher_.join();
+    }
     delete buffer_;
+    buffer_ = nullptr;
   }
 
-  bool HasNext() const override;
+  ~DoubleBufferReader() { EndPrefetcher(); }
 
  private:
   void PrefetchThreadFunc();
@@ -123,6 +127,15 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
   }
 };
 
+bool DoubleBufferReader::HasNext() const {
+  if (local_buffer_.payloads_.empty()) {
+    bool ok = buffer_->Receive(&local_buffer_);
+    return ok;
+  } else {
+    return true;
+  }
+}
+
 void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
   if (!HasNext()) {
     PADDLE_THROW("There is no next data!");
@@ -137,40 +150,36 @@ void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
 
 void DoubleBufferReader::ReInit() {
   reader_->ReInit();
-  buffer_->Close();
-  prefetcher_.join();
-  delete buffer_;
-  start_thread();
+  EndPrefetcher();
+  StartPrefetcher();
 }
 
 void DoubleBufferReader::PrefetchThreadFunc() {
   VLOG(5) << "A new prefetch thread starts.";
-  size_t gpu_ctx_offset = 0;
-  std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(4);
-  std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(4);
-  size_t tensor_cache_id = 0;
+  std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(kCacheSize);
+  std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(kCacheSize);
+  size_t cached_tensor_id = 0;
 
   while (reader_->HasNext()) {
     Item batch;
-    reader_->ReadNext(&batch.payloads_);
+    auto& cpu_batch = cpu_tensor_cache[cached_tensor_id];
+    reader_->ReadNext(&cpu_batch);
     if (platform::is_gpu_place(place_)) {
-      tensor_cache_id %= 4;
-      auto& gpu_batch = gpu_tensor_cache[tensor_cache_id];
-      auto& cpu_batch = cpu_tensor_cache[tensor_cache_id];
-      cpu_batch = batch.payloads_;
-      ++tensor_cache_id;
-
-      auto& gpu_ctx = this->ctxs_[gpu_ctx_offset++];
-      gpu_ctx_offset %= this->ctxs_.size();
-
-      gpu_batch.resize(batch.payloads_.size());
+      auto& gpu_batch = gpu_tensor_cache[cached_tensor_id];
+      auto* gpu_ctx = ctxs_[cached_tensor_id].get();
+      gpu_batch.resize(cpu_batch.size());
       for (size_t i = 0; i < cpu_batch.size(); ++i) {
         framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]);
         gpu_batch[i].set_lod(batch.payloads_[i].lod());
       }
-      batch.ctx_ = gpu_ctx.get();
-      batch.payloads_ = gpu_batch;
+      batch.payload_ = gpu_batch;
+      batch.ctx_ = gpu_ctx;
+    } else {
+      // CPUPlace
+      batch.payload_ = cpu_batch;
     }
+    ++cached_tensor_id;
+    cached_tensor_id %= kCacheSize;
 
     try {
       buffer_->Send(&batch);
@@ -184,15 +193,6 @@ void DoubleBufferReader::PrefetchThreadFunc() {
   VLOG(5) << "Prefetch thread terminates.";
 }
 
-bool DoubleBufferReader::HasNext() const {
-  if (local_buffer_.payloads_.empty()) {
-    bool ok = buffer_->Receive(&local_buffer_);
-    return ok;
-  } else {
-    return true;
-  }
-}
-
 }  // namespace reader
 }  // namespace operators
 }  // namespace paddle

From 55e4b89f1482a885da2bec1d10e27dcaaf0b432e Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Sat, 31 Mar 2018 01:36:25 +0800
Subject: [PATCH 14/57] remove local_buffer_

---
 .../reader/create_double_buffer_reader_op.cc    | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 1b7df87b35..788f7582ae 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -73,7 +73,6 @@ class DoubleBufferReader : public framework::DecoratedReader {
   framework::Channel<Item>* buffer_;
   platform::Place place_;
   std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
-  mutable Item local_buffer_;
 };
 
 class CreateDoubleBufferReaderOp : public framework::OperatorBase {
@@ -128,12 +127,9 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
 };
 
 bool DoubleBufferReader::HasNext() const {
-  if (local_buffer_.payloads_.empty()) {
-    bool ok = buffer_->Receive(&local_buffer_);
-    return ok;
-  } else {
-    return true;
+  while (!buffer_->IsClosed() && !buffer_->CanReceive()) {
   }
+  return buffer_->CanReceive()
 }
 
 void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
@@ -141,10 +137,11 @@ void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
     PADDLE_THROW("There is no next data!");
   }
 
-  *out = local_buffer_.payloads_;
-  local_buffer_.payloads_.clear();
-  if (local_buffer_.ctx_) {
-    local_buffer_.ctx_->Wait();
+  Item batch;
+  buffer_->Receive(&batch);
+  *out = batch.payload_;
+  if (batch.ctx_) {
+    batch.ctx_->Wait();
   }
 }
 

From a469666e42ebf6f6c19e26036531a9336e49a3b2 Mon Sep 17 00:00:00 2001
From: JiayiFeng <fengjiayi@baidu.com>
Date: Fri, 30 Mar 2018 18:44:25 +0000
Subject: [PATCH 15/57] fix compile errors

---
 .../reader/create_double_buffer_reader_op.cc  | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 788f7582ae..3f0f449248 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -20,8 +20,8 @@ namespace paddle {
 namespace operators {
 namespace reader {
 
-static constexpr size_t kChannelSize = 2;
-static constexpr size_t kCacheSize = 4;  // kChannelSize + 2
+static constexpr size_t kCacheSize = 2;
+static constexpr size_t kChannelSize = 0;  // kCacheSize - 2
 
 class DoubleBufferReader : public framework::DecoratedReader {
  public:
@@ -36,7 +36,7 @@ class DoubleBufferReader : public framework::DecoratedReader {
       ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
       : DecoratedReader(reader), place_(target_place) {
 #ifdef PADDLE_WITH_CUDA
-    for (size_t i = 0; i < kChannelSize + 2; ++i) {
+    for (size_t i = 0; i < kCacheSize; ++i) {
       if (platform::is_gpu_place(place_)) {
         ctxs_.emplace_back(new platform::CUDADeviceContext(
             boost::get<platform::CUDAPlace>(place_)));
@@ -51,17 +51,17 @@ class DoubleBufferReader : public framework::DecoratedReader {
   void ReInit() override;
 
   void StartPrefetcher() {
-    buffer_ = framework::MakeChannel<Item>(kChannelSize);
+    channel_ = framework::MakeChannel<Item>(kChannelSize);
     prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
   }
 
   void EndPrefetcher() {
-    buffer_->Close();
-    if (prefecther_.joinable()) {
+    channel_->Close();
+    if (prefetcher_.joinable()) {
       prefetcher_.join();
     }
-    delete buffer_;
-    buffer_ = nullptr;
+    delete channel_;
+    channel_ = nullptr;
   }
 
   ~DoubleBufferReader() { EndPrefetcher(); }
@@ -70,7 +70,7 @@ class DoubleBufferReader : public framework::DecoratedReader {
   void PrefetchThreadFunc();
 
   std::thread prefetcher_;
-  framework::Channel<Item>* buffer_;
+  framework::Channel<Item>* channel_;
   platform::Place place_;
   std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
 };
@@ -127,9 +127,9 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
 };
 
 bool DoubleBufferReader::HasNext() const {
-  while (!buffer_->IsClosed() && !buffer_->CanReceive()) {
+  while (!channel_->IsClosed() && !channel_->CanReceive()) {
   }
-  return buffer_->CanReceive()
+  return channel_->CanReceive();
 }
 
 void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
@@ -138,8 +138,8 @@ void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
   }
 
   Item batch;
-  buffer_->Receive(&batch);
-  *out = batch.payload_;
+  channel_->Receive(&batch);
+  *out = batch.payloads_;
   if (batch.ctx_) {
     batch.ctx_->Wait();
   }
@@ -167,26 +167,26 @@ void DoubleBufferReader::PrefetchThreadFunc() {
       gpu_batch.resize(cpu_batch.size());
       for (size_t i = 0; i < cpu_batch.size(); ++i) {
         framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]);
-        gpu_batch[i].set_lod(batch.payloads_[i].lod());
+        gpu_batch[i].set_lod(cpu_batch[i].lod());
       }
-      batch.payload_ = gpu_batch;
+      batch.payloads_ = gpu_batch;
       batch.ctx_ = gpu_ctx;
     } else {
       // CPUPlace
-      batch.payload_ = cpu_batch;
+      batch.payloads_ = cpu_batch;
     }
     ++cached_tensor_id;
     cached_tensor_id %= kCacheSize;
 
     try {
-      buffer_->Send(&batch);
+      channel_->Send(&batch);
     } catch (paddle::platform::EnforceNotMet e) {
       VLOG(5) << "WARNING: The double buffer channel has been closed. The "
                  "prefetch thread will terminate.";
       break;
     }
   }
-  buffer_->Close();
+  channel_->Close();
   VLOG(5) << "Prefetch thread terminates.";
 }
 

From 3a5bce775e90882c21778334420a9b597c2de583 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Sat, 31 Mar 2018 09:20:14 +0800
Subject: [PATCH 16/57] try to complete

---
 paddle/fluid/operators/detail/grpc_server.cc  |  8 +++++--
 .../operators/detail/grpc_server_test.cc      | 21 ++++++++++++++-----
 paddle/fluid/operators/detail/grpc_service.h  |  2 +-
 paddle/fluid/operators/listen_and_serv_op.cc  |  4 ++++
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 26bef375cb..407fa5ef5a 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detail/grpc_server.h"
+#include <paddle/fluid/operators/detail/send_recv.pb.h>
 
 using ::grpc::ServerAsyncResponseWriter;
 
@@ -156,6 +157,8 @@ class RequestPrefetch final : public RequestBase {
     ::grpc::ByteBuffer relay;
     // TODO(Yancey1989): execute the Block which containers prefetch ops
 
+    VLOG(3) << "RequestPrefetch Process in";
+
     responder_.Finish(relay, ::grpc::Status::OK, this);
     status_ = FINISH;
   }
@@ -251,6 +254,7 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
 }
 
 void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
+  VLOG(4) << "TryToRegisterNewPrefetchOne in";
   std::unique_lock<std::mutex> lock(cq_mutex_);
   if (is_shut_down_) {
     return;
@@ -287,8 +291,8 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
     // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
     // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
     if (!ok) {
-      LOG(WARNING) << cq_name << " recv no regular event:argument name"
-                   << base->GetReqName();
+      LOG(WARNING) << cq_name << " recv no regular event:argument name["
+                   << base->GetReqName() << "]";
       TryToRegisterNewOne();
       delete base;
       continue;
diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc
index 5773748106..1ad62863a1 100644
--- a/paddle/fluid/operators/detail/grpc_server_test.cc
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
@@ -28,6 +28,7 @@ std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;
 
 void StartServer(const std::string& endpoint) {
   rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+  rpc_service_->RunSyncUpdate();
 }
 
 TEST(PREFETCH, CPU) {
@@ -39,13 +40,23 @@ TEST(PREFETCH, CPU) {
   platform::CPUPlace place;
   platform::CPUDeviceContext ctx(place);
   // create var on local scope
-  std::string var_name("tmp_0");
-  auto var = scope.Var(var_name);
-  auto tensor = var->GetMutable<framework::LoDTensor>();
-  tensor->Resize({10, 10});
+  std::string in_var_name("in");
+  std::string out_var_name("out");
+  auto* in_var = scope.Var(in_var_name);
+  auto* in_tensor = in_var->GetMutable<framework::LoDTensor>();
+  in_tensor->Resize({10, 10});
+  VLOG(3) << "before mutable_data";
+  in_tensor->mutable_data<int>(place);
 
+  scope.Var(out_var_name);
+
+  VLOG(3) << "before fetch";
   detail::RPCClient client;
-  client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, var_name, "");
+  client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name,
+                               out_var_name);
+  client.Wait();
+
+  rpc_service_->ShutDown();
   server_thread.join();
   rpc_service_.reset(nullptr);
 }
diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h
index 879e21933b..1ec8cf11c5 100644
--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
@@ -80,7 +80,7 @@ enum class GrpcMethod {
 };
 
 static const int kGrpcNumMethods =
-    static_cast<int>(GrpcMethod::kGetVariable) + 1;
+    static_cast<int>(GrpcMethod::kPrefetchVariable) + 1;
 
 inline const char* GrpcMethodName(GrpcMethod id) {
   switch (id) {
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index d5eae2be79..c9455fd35c 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -112,6 +112,10 @@ class ListenAndServOp : public framework::OperatorBase {
 
     framework::Executor executor(dev_place);
 
+    rpc_service_->SetExecutor(&executor);
+    rpc_service_->SetPrefetchBlkdId(0);
+    rpc_service_->SetProgram(program);
+
     // TODO(typhoonzero): change this to a while_op for every cluster-batch.
     bool exit_flag = false;
     // Record received sparse variables, so that

From 5aa440fd7a5a6bff32fc628a6907e16cb6feb8a9 Mon Sep 17 00:00:00 2001
From: JiayiFeng <fengjiayi@baidu.com>
Date: Sat, 31 Mar 2018 05:02:19 +0000
Subject: [PATCH 17/57] Add move constructor for Item

---
 .../operators/reader/create_double_buffer_reader_op.cc   | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 3f0f449248..f15747e266 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -27,6 +27,15 @@ class DoubleBufferReader : public framework::DecoratedReader {
  public:
   struct Item {
     Item() : ctx_(nullptr) {}
+    Item(Item&& b) {
+      payloads_ = std::move(b.payloads_);
+      ctx_ = std::move(b.ctx_);
+    }
+    Item& operator=(Item&& b) {
+      payloads_ = std::move(b.payloads_);
+      ctx_ = std::move(b.ctx_);
+      return *this;
+    }
 
     std::vector<framework::LoDTensor> payloads_;
     platform::DeviceContext* ctx_;

From c0257f0a5b315bb39f2c3e92c5afe43d631eae69 Mon Sep 17 00:00:00 2001
From: JiayiFeng <fengjiayi@baidu.com>
Date: Sat, 31 Mar 2018 05:17:57 +0000
Subject: [PATCH 18/57] Add comments

---
 .../operators/reader/create_double_buffer_reader_op.cc     | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index f15747e266..3f1d36a3e6 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -20,7 +20,14 @@ namespace paddle {
 namespace operators {
 namespace reader {
 
+// 'Double buffer' means we shall maintain two batch of input data at the same
+// time. So the kCacheSize shoul be at least 2.
 static constexpr size_t kCacheSize = 2;
+// There will be two bacthes out of the channel during training:
+// 1. the one waiting to be sent to the channel
+// 2. the one just be received from the channel, which is also being used by
+// subsequent operators.
+// So the channel size should be kChacheSize - 2
 static constexpr size_t kChannelSize = 0;  // kCacheSize - 2
 
 class DoubleBufferReader : public framework::DecoratedReader {

From 597c845c998a176610ebd83f14a6215008b29f38 Mon Sep 17 00:00:00 2001
From: JiayiFeng <fengjiayi@baidu.com>
Date: Sat, 31 Mar 2018 05:21:59 +0000
Subject: [PATCH 19/57] fix typo

---
 paddle/fluid/operators/reader/create_double_buffer_reader_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 3f1d36a3e6..342cd2a549 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 namespace reader {
 
-// 'Double buffer' means we shall maintain two batch of input data at the same
+// 'Double buffer' means we shall maintain two batches of input data at the same
 // time. So the kCacheSize shoul be at least 2.
 static constexpr size_t kCacheSize = 2;
 // There will be two bacthes out of the channel during training:

From 453630692e439451b42a2501c2d74f7a011ad14d Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Sun, 1 Apr 2018 23:33:07 +0800
Subject: [PATCH 20/57] fix prefetch hang problem, add some more logs

---
 paddle/fluid/operators/detail/grpc_client.cc | 16 +++++++++-------
 paddle/fluid/operators/detail/grpc_server.cc | 13 +++++++++++--
 paddle/fluid/operators/detail/grpc_service.h |  4 ++--
 paddle/fluid/operators/listen_and_serv_op.cc | 12 ++----------
 4 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index ba9882ce24..f8ec39e8c5 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "grpc_client.h"
-#include <sys/time.h>
+#include "paddle/fluid/operators/detail/grpc_client.h"
+
+#include <limits>
+
 #include "paddle/fluid/framework/threadpool.h"
 
 namespace paddle {
@@ -52,7 +54,7 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
     call->StartCall();
-    call->Finish(&s->reply_, &s->status_, (void*)s);
+    call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
   });
 
   req_count_++;
@@ -109,7 +111,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
     call->StartCall();
-    call->Finish(&s->reply_, &s->status_, (void*)s);
+    call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
   });
 
   req_count_++;
@@ -153,7 +155,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
         s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
         &cq_);
     call->StartCall();
-    call->Finish(&s->reply_, &s->status_, (void*)s);
+    call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
   });
 
   req_count_++;
@@ -169,7 +171,7 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
   sendrecv::VariableMessage req;
   req.set_varname(BATCH_BARRIER_MESSAGE);
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
   req_count_++;
 }
 
@@ -181,7 +183,7 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
   sendrecv::VariableMessage req;
   req.set_varname(FETCH_BARRIER_MESSAGE);
   auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
   req_count_++;
 }
 
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index b8fba06c7b..71acc568a9 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detail/grpc_server.h"
-#include <paddle/fluid/operators/detail/send_recv.pb.h>
+
+#include <limits>
+#include <string>
 
 using ::grpc::ServerAsyncResponseWriter;
 
@@ -224,6 +226,7 @@ void AsyncGRPCServer::ShutdownQueue() {
   std::unique_lock<std::mutex> lock(cq_mutex_);
   cq_send_->Shutdown();
   cq_get_->Shutdown();
+  cq_prefetch_->Shutdown();
 }
 
 // This URL explains why shutdown is complicate:
@@ -236,6 +239,7 @@ void AsyncGRPCServer::ShutDown() {
 void AsyncGRPCServer::TryToRegisterNewSendOne() {
   std::unique_lock<std::mutex> lock(cq_mutex_);
   if (is_shut_down_) {
+    VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
     return;
   }
   RequestSend* send = new RequestSend(&service_, cq_send_.get(), scope_,
@@ -246,6 +250,7 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() {
 void AsyncGRPCServer::TryToRegisterNewGetOne() {
   std::unique_lock<std::mutex> lock(cq_mutex_);
   if (is_shut_down_) {
+    VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
     return;
   }
   RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_,
@@ -257,6 +262,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
   VLOG(4) << "TryToRegisterNewPrefetchOne in";
   std::unique_lock<std::mutex> lock(cq_mutex_);
   if (is_shut_down_) {
+    VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
     return;
   }
   RequestPrefetch* prefetch =
@@ -274,18 +280,21 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
 
   void* tag = NULL;
   bool ok = false;
+
   while (true) {
+    VLOG(3) << "HandleRequest for " << cq_name << " while in";
     if (!cq->Next(&tag, &ok)) {
       LOG(INFO) << cq_name << " CompletionQueue shutdown!";
       break;
     }
+    VLOG(3) << "HandleRequest for " << cq_name << " while after Next";
 
     PADDLE_ENFORCE(tag);
     // FIXME(typhoonzero): de-couple the barriers with recv_op
     if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
     if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
 
-    RequestBase* base = (RequestBase*)tag;
+    RequestBase* base = reinterpret_cast<RequestBase*>(tag);
     // reference:
     // https://github.com/tensorflow/tensorflow/issues/5596
     // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h
index 1ec8cf11c5..e6dab2f5a3 100644
--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
@@ -89,7 +89,7 @@ inline const char* GrpcMethodName(GrpcMethod id) {
     case GrpcMethod::kGetVariable:
       return "/sendrecv.SendRecvService/GetVariable";
     case GrpcMethod::kPrefetchVariable:
-      return "/sendrecv.SendREcvService/PrefetchVariable";
+      return "/sendrecv.SendRecvService/PrefetchVariable";
   }
 
   // Shouldn't be reached.
@@ -117,5 +117,5 @@ class GrpcService final {
 };
 
 }  // namespace detail
-}  // namespace operator
+}  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 66f7058eac..67ee47f9f6 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -13,22 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <stdint.h>
-#include <sys/stat.h>
 #include <ostream>
-#include <thread>
-
-#include <unistd.h>
 
 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/proto_desc.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/grpc_server.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
-#include "paddle/fluid/operators/detail/simple_block_queue.h"
-#include "paddle/fluid/string/printf.h"
 
 namespace paddle {
 namespace operators {
@@ -177,7 +168,8 @@ class ListenAndServOp : public framework::OperatorBase {
       }
       ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope);
 
-      VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts;
+      VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts
+              << "(ms)";
 
       // Reset the received sparse variables, the sum operator would not
       // sum the input sparse variables which rows is empty at the next

From 9af9effc93e39427c758343f6be9892652049863 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Apr 2018 09:26:09 +0800
Subject: [PATCH 21/57] optimize code

---
 paddle/fluid/operators/detail/grpc_client.cc | 3 +--
 paddle/fluid/operators/detail/grpc_server.cc | 1 -
 paddle/fluid/operators/detail/grpc_server.h  | 4 +++-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index f8ec39e8c5..d79ba6d291 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -72,8 +72,7 @@ void ProcGetResponse(const VarHandle& var_h,
 template <typename T>
 void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
   ::grpc::Slice slice(proto.ByteSizeLong());
-  proto.SerializeWithCachedSizesToArray(
-      const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(slice.begin())));
+  proto.SerializeWithCachedSizesToArray(const_cast<uint8_t*>(slice.begin()));
   ::grpc::ByteBuffer tmp(&slice, 1);
   result->Swap(&tmp);
 }
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 71acc568a9..09ca4cc052 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -259,7 +259,6 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
 }
 
 void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
-  VLOG(4) << "TryToRegisterNewPrefetchOne in";
   std::unique_lock<std::mutex> lock(cq_mutex_);
   if (is_shut_down_) {
     VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
index dd5cf4b377..b0596d3cd1 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -15,7 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <grpc++/grpc++.h>
-#include <thread>
+#include <string>
+#include <utility>
 
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -93,6 +94,7 @@ class AsyncGRPCServer final {
 
   // received variable from RPC, operators fetch variable from this queue.
   SimpleBlockQueue<MessageWithName> var_get_queue_;
+  // client send variable to this queue.
   ReceivedQueue var_recv_queue_;
 
   // condition of the sub program

From 606c57da23511b4474123db519a67ede21de9d67 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Mon, 2 Apr 2018 09:33:08 +0800
Subject: [PATCH 22/57] update by comment

---
 paddle/fluid/operators/detail/grpc_server.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 26bef375cb..44c23db0b1 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -153,10 +153,10 @@ class RequestPrefetch final : public RequestBase {
 
   virtual void Process() {
     // prefetch process...
-    ::grpc::ByteBuffer relay;
+    ::grpc::ByteBuffer reply;
     // TODO(Yancey1989): execute the Block which containers prefetch ops
 
-    responder_.Finish(relay, ::grpc::Status::OK, this);
+    responder_.Finish(reply, ::grpc::Status::OK, this);
     status_ = FINISH;
   }
 

From 6cfc0c14971828ee9528502a2787456869210a5c Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Mon, 2 Apr 2018 11:15:52 +0800
Subject: [PATCH 23/57] "polish code" (#9318)

* "polish code"

* "fix ci"

* "fix ci"

* "done"
---
 python/paddle/fluid/executor.py | 73 ++++++++-------------------------
 1 file changed, 18 insertions(+), 55 deletions(-)

diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 2612fb1ae4..54d0a12bcd 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -48,8 +48,7 @@ def as_numpy(tensor):
     assert isinstance(tensor, core.LoDTensor)
     lod = tensor.lod()
     if len(lod) > 0:
-        raise RuntimeError(
-            "Some of your featched tensors hold LoD information. \
+        raise RuntimeError("Some of your fetched tensors hold LoD information. \
             They can not be completely cast to Python ndarray. \
             Please set the parameter 'return_numpy' as 'False' to \
             return LoDTensor itself directly.")
@@ -180,60 +179,24 @@ def get_program_cache_key(feed, fetch_list):
 
 
 class Executor(object):
-    def __init__(self, places):
-        if not isinstance(places, list) and not isinstance(places, tuple):
-            places = [places]
-
-        act_places = []
-        for each in places:
-            p = core.Place()
-            p.set_place(each)
-            act_places.append(p)
-
-        # TODO(dzhwinter) : only use the first place
-        self.executor = core.Executor(act_places[0])
-        self.places = places
+    def __init__(self, place):
+        self.place = place
+        p = core.Place()
+        p.set_place(place)
+        self.executor = core.Executor(p)
         self.program_caches = dict()
 
-    def aslodtensor(self, data):
-        def accumulate(data):
-            if not isinstance(data, list):
-                return 1
-            return sum([accumulate(sub) for sub in data])
-
-        def parselod(data):
-            seq_lens = [accumulate(seq) for seq in data]
-            cur_len = 0
-            lod = [cur_len]
-            for l in seq_lens:
-                cur_len += l
-                lod.append(cur_len)
-            return lod
-
-        assert len(self.places) != 0
-        if not isinstance(data, list):
-            # pure tensor case
-            tensor = core.LoDTensor()
-            tensor.set(data, self.places[0])
-            return tensor
-        else:
-            raise RuntimeError("Current implementation lacks unittests")
-            # lodtensor case
-            lod = []
-            if not isinstance(data[0], list):
-                lod.append(parselod(data))
-                flattened_data = np.concatenate(data, axis=0).astype("int64")
-            else:
-                while isinstance(data[0], list):
-                    lod.append(parselod(seq))
-                    flattened_data = [item for seq in data for item in seq]
-                    data = flattened_data
-                flattened_data = np.concatenate(data, axis=0).astype("int64")
-            flattened_data = flattened_data.reshape([len(flattened_data), 1])
-            tensor = core.LoDTensor()
-            tensor.set(flattened_data, self.places[0])
-            tensor.set_lod(lod)
-            return tensor
+    def as_lodtensor(self, data):
+        if isinstance(data, list):
+            raise RuntimeError("Some of your feed data hold LoD information. \
+                They can not be completely cast from a list of Python \
+                ndarray to LoDTensor. Please convert data to LoDTensor \
+                directly before feeding the data.\
+                ")
+        # single tensor case
+        tensor = core.LoDTensor()
+        tensor.set(data, self.place)
+        return tensor
 
     def _get_program_cache(self, program_cache_key):
         return self.program_caches.get(program_cache_key, None)
@@ -293,7 +256,7 @@ class Executor(object):
                 feed_target_name = op.desc.output('Out')[0]
                 cur_feed = feed[feed_target_name]
                 if not isinstance(cur_feed, core.LoDTensor):
-                    cur_feed = self.aslodtensor(cur_feed)
+                    cur_feed = self.as_lodtensor(cur_feed)
                 idx = op.desc.attr('col')
                 core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
             else:

From 04a5c0378517ec08f2eba1339de94bd2e786e516 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Apr 2018 11:18:00 +0800
Subject: [PATCH 24/57] add todo

---
 paddle/fluid/operators/listen_and_serv_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 67ee47f9f6..b19add24e2 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -102,6 +102,7 @@ class ListenAndServOp : public framework::OperatorBase {
 
     framework::Executor executor(dev_place);
 
+    // TODO(qiao) set proper fields for table lookup and update
     rpc_service_->SetExecutor(&executor);
     rpc_service_->SetPrefetchBlkdId(0);
     rpc_service_->SetProgram(program);

From 772cdfe196f6a343ad20f3c2644c078e4e9ef19e Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Mon, 2 Apr 2018 12:25:01 +0800
Subject: [PATCH 25/57] fix single pserver error

---
 python/paddle/fluid/distribute_transpiler.py | 28 +++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index 24297ffe33..9311fc9904 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -276,20 +276,25 @@ class DistributeTranspiler:
             suff_idx = v.name.find(".trainer_")
             if suff_idx >= 0:
                 orig_var_name = v.name[:suff_idx]
-            pserver_program.global_block().create_var(
+            else:
+                orig_var_name = v.name
+            single_trainer_var = pserver_program.global_block().create_var(
                 name=orig_var_name,
                 persistable=True,
                 type=v.type,
                 dtype=v.dtype,
                 shape=v.shape)
-            for trainer_id in xrange(self.trainers):
-                var = pserver_program.global_block().create_var(
-                    name="%s.trainer_%d" % (orig_var_name, trainer_id),
-                    persistable=False,
-                    type=v.type,
-                    dtype=v.dtype,
-                    shape=v.shape)
-                recv_inputs.append(var)
+            if self.trainers > 1:
+                for trainer_id in xrange(self.trainers):
+                    var = pserver_program.global_block().create_var(
+                        name="%s.trainer_%d" % (orig_var_name, trainer_id),
+                        persistable=False,
+                        type=v.type,
+                        dtype=v.dtype,
+                        shape=v.shape)
+                    recv_inputs.append(var)
+            else:
+                recv_inputs.append(single_trainer_var)
 
         # step3
         optimize_block = pserver_program.create_block(0)
@@ -511,8 +516,11 @@ class DistributeTranspiler:
 
     def _append_split_op(self, program, gradblocks):
         # Split variables that need to be split and append respective ops
+        add_suffix = False
+        if self.trainers > 1:
+            add_suffix = True
         var_mapping = self._create_vars_from_blocklist(
-            program, gradblocks, add_trainer_suffix=True)
+            program, gradblocks, add_trainer_suffix=add_suffix)
         for varname, splited_vars in var_mapping.iteritems():
             # variable that don't need to split have empty splited_vars
             if len(splited_vars) <= 1:

From de5e56bee8cdc92f4a9417c3b91dd6084ac86b79 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Mon, 2 Apr 2018 13:06:46 +0800
Subject: [PATCH 26/57]  add og has been broadcasted

---
 .../fluid/framework/details/multi_devices_graph_builder.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index a1b913a863..1aa33768c8 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -55,6 +55,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     const ProgramDesc &program) const {
   auto graph = new SSAGraph();
   SSAGraph &result = *graph;
+  std::unordered_set<std::string> og_has_bc;
   result.vars_.resize(places_.size());
 
   bool is_forwarding = true;
@@ -123,8 +124,10 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     if (!is_forwarding) {
       auto var_names = op->OutputArgumentNames();
       for (auto &og : var_names) {
-        if (grad_names_.count(og) != 0) {  // is param grad
-                                           // Insert NCCL AllReduce Op
+        if (grad_names_.count(og) != 0 &&
+            og_has_bc.count(og) == 0) {  // is param grad
+                                         // Insert NCCL AllReduce Op
+          og_has_bc.insert(og);
 #ifdef PADDLE_WITH_CUDA
           result.ops_.emplace_back(
               new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));

From 19b4a2a5169afe597745f9543d6d4e5af45aa2f1 Mon Sep 17 00:00:00 2001
From: weixing02 <564445201@qq.com>
Date: Mon, 2 Apr 2018 13:20:23 +0800
Subject: [PATCH 27/57] Fix some dead links for cn version

---
 doc/fluid/dev/index_cn.rst                              | 6 +++---
 doc/fluid/dev/index_en.rst                              | 2 +-
 doc/fluid/dev/{new_op_kernel_en.md => new_op_kernel.md} | 0
 3 files changed, 4 insertions(+), 4 deletions(-)
 rename doc/fluid/dev/{new_op_kernel_en.md => new_op_kernel.md} (100%)

diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index e70bf5dff3..f627437f35 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -4,9 +4,9 @@
 .. toctree::
   :maxdepth: 1
 
-  new_op_en.md
-  new_op_kernel_en.md
-  use_eigen_en.md
+  new_op_cn.md
+  new_op_kernel.md
+  use_eigen_cn.md
   name_convention.md
   support_new_device.md
   releasing_process.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index f0e9afcfcc..0b65fed67a 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -5,7 +5,7 @@ Development
   :maxdepth: 1
 
   new_op_en.md
-  new_op_kernel_en.md
+  new_op_kernel.md
   use_eigen_en.md
   name_convention.md
   support_new_device.md
diff --git a/doc/fluid/dev/new_op_kernel_en.md b/doc/fluid/dev/new_op_kernel.md
similarity index 100%
rename from doc/fluid/dev/new_op_kernel_en.md
rename to doc/fluid/dev/new_op_kernel.md

From 9b6c5397c5c39864c453a19bdd2dc6ab21cee26b Mon Sep 17 00:00:00 2001
From: weixing02 <564445201@qq.com>
Date: Mon, 2 Apr 2018 13:27:47 +0800
Subject: [PATCH 28/57] Merge branch develop

---
 .travis.yml                                   |   2 +-
 paddle/fluid/operators/detail/CMakeLists.txt  |   3 +-
 paddle/fluid/operators/detail/grpc_client.cc  |   3 +-
 paddle/fluid/operators/detail/grpc_server.cc  |  61 +++++++-
 paddle/fluid/operators/detail/grpc_server.h   |  15 ++
 .../operators/detail/grpc_server_test.cc      |  51 +++++++
 paddle/fluid/operators/detail/grpc_service.h  |   3 +
 paddle/fluid/operators/detail/send_recv.proto |   2 +
 paddle/fluid/operators/reshape_op.cc          | 130 ++++++++----------
 paddle/fluid/operators/reshape_op.h           | 127 ++++++++++++++++-
 python/paddle/fluid/executor.py               |  73 +++-------
 python/paddle/fluid/layers/detection.py       |  21 ++-
 python/paddle/fluid/layers/nn.py              |  98 +++++++++++++
 python/paddle/fluid/layers/ops.py             |   1 -
 .../paddle/fluid/tests/unittests/op_test.py   |   8 +-
 .../unittests/test_mine_hard_examples_op.py   |   0
 .../fluid/tests/unittests/test_reshape_op.py  | 101 ++++++++++++--
 .../tests/unittests/test_target_assign_op.py  |   0
 18 files changed, 529 insertions(+), 170 deletions(-)
 create mode 100644 paddle/fluid/operators/detail/grpc_server_test.cc
 mode change 100755 => 100644 python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
 mode change 100755 => 100644 python/paddle/fluid/tests/unittests/test_target_assign_op.py

diff --git a/.travis.yml b/.travis.yml
index bf6a41d13c..929c847bd3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,7 +34,7 @@ addons:
       - automake
       - libtool
       - ccache
-  ssh_known_hosts: 52.76.173.135
+  ssh_known_hosts: 13.229.163.131
 before_install:
   - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
   # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
index d59411dfb9..f8cd2852f3 100644
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@@ -2,7 +2,8 @@ if(WITH_DISTRIBUTE)
   grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
       grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  set_source_files_properties(serde_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(serde_test.cc grpc_server_test PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
       cares zlib protobuf sendrecvop_grpc)
+  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
 endif()
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index 9652bb888b..ba9882ce24 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -150,7 +150,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
     s->response_call_back_ = ProcGetResponse;
 
     auto call = s->stub_g_.PrepareUnaryCall(
-        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", req, &cq_);
+        s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
+        &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, (void*)s);
   });
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 109c762e74..591b3e334a 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -128,6 +128,47 @@ class RequestGet final : public RequestBase {
   SimpleBlockQueue<MessageWithName>* queue_;
 };
 
+class RequestPrefetch final : public RequestBase {
+ public:
+  explicit RequestPrefetch(GrpcService::AsyncService* service,
+                           ::grpc::ServerCompletionQueue* cq,
+                           framework::Scope* scope,
+                           const platform::DeviceContext* dev_ctx,
+                           framework::Executor* executor,
+                           framework::ProgramDesc* program, int blkid)
+      : RequestBase(service, cq, dev_ctx),
+        responder_(&ctx_),
+        scope_(scope),
+        executor_(executor),
+        program_(program),
+        blkid_(blkid) {
+    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
+    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
+                                cq_, this);
+  }
+
+  virtual ~RequestPrefetch() {}
+
+  virtual std::string GetReqName() { return request_.varname(); }
+
+  virtual void Process() {
+    // prefetch process...
+    ::grpc::ByteBuffer reply;
+    // TODO(Yancey1989): execute the Block which containers prefetch ops
+
+    responder_.Finish(reply, ::grpc::Status::OK, this);
+    status_ = FINISH;
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+  framework::Scope* scope_;
+  framework::Executor* executor_;
+  framework::ProgramDesc* program_;
+  int blkid_;
+};
+
 void AsyncGRPCServer::WaitClientGet(int count) {
   int fetch_barriers = 0;
   while (fetch_barriers < count) {
@@ -147,6 +188,7 @@ void AsyncGRPCServer::RunSyncUpdate() {
 
   cq_send_ = builder.AddCompletionQueue();
   cq_get_ = builder.AddCompletionQueue();
+  cq_prefetch_ = builder.AddCompletionQueue();
 
   server_ = builder.BuildAndStart();
   LOG(INFO) << "Server listening on " << address_ << std::endl;
@@ -155,6 +197,8 @@ void AsyncGRPCServer::RunSyncUpdate() {
       std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
   std::function<void()> get_register =
       std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
+  std::function<void()> prefetch_register =
+      std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
 
   t_send_.reset(
       new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
@@ -163,11 +207,14 @@ void AsyncGRPCServer::RunSyncUpdate() {
   t_get_.reset(
       new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
                                 cq_get_.get(), "cq_get", get_register)));
-
+  t_prefetch_.reset(new std::thread(
+      std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
+                "cq_prefetch", prefetch_register)));
   // wait server
   server_->Wait();
   t_send_->join();
   t_get_->join();
+  t_prefetch_->join();
 }
 
 void AsyncGRPCServer::ShutdownQueue() {
@@ -203,6 +250,18 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
   VLOG(4) << "Create RequestGet status:" << get->Status();
 }
 
+void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  if (is_shut_down_) {
+    return;
+  }
+  RequestPrefetch* prefetch =
+      new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_,
+                          executor_, program_, prefetch_blk_id_);
+
+  VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
+}
+
 // FIXME(typhoonzero): change cq_name to enum.
 void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
                                     std::string cq_name,
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
index 10e6dd45a9..dd5cf4b377 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -17,7 +17,9 @@ limitations under the License. */
 #include <grpc++/grpc++.h>
 #include <thread>
 
+#include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
@@ -53,6 +55,12 @@ class AsyncGRPCServer final {
 
   void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; }
 
+  void SetProgram(framework::ProgramDesc *program) { program_ = program; }
+
+  void SetPrefetchBlkdId(int blkid) { prefetch_blk_id_ = blkid; }
+
+  void SetExecutor(framework::Executor *executor) { executor_ = executor; }
+
   const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
 
   void Push(const std::string &msg_name) {
@@ -66,6 +74,7 @@ class AsyncGRPCServer final {
                      std::function<void()> TryToRegisterNewOne);
   void TryToRegisterNewSendOne();
   void TryToRegisterNewGetOne();
+  void TryToRegisterNewPrefetchOne();
   void ShutdownQueue();
 
  private:
@@ -73,6 +82,7 @@ class AsyncGRPCServer final {
   volatile bool is_shut_down_ = false;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_;
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_;
 
   GrpcService::AsyncService service_;
   std::unique_ptr<::grpc::Server> server_;
@@ -92,6 +102,11 @@ class AsyncGRPCServer final {
 
   std::unique_ptr<std::thread> t_send_;
   std::unique_ptr<std::thread> t_get_;
+  std::unique_ptr<std::thread> t_prefetch_;
+
+  int prefetch_blk_id_;
+  framework::ProgramDesc *program_;
+  framework::Executor *executor_;
 };
 
 };  // namespace detail
diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc
new file mode 100644
index 0000000000..5773748106
--- /dev/null
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/grpc_server.h"
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace detail = paddle::operators::detail;
+
+std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;
+
+void StartServer(const std::string& endpoint) {
+  rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+}
+
+TEST(PREFETCH, CPU) {
+  // start up a server instance backend
+  // TODO(Yancey1989): Need to start a server with optimize blocks and
+  // prefetch blocks.
+  std::thread server_thread(StartServer, "127.0.0.1:8889");
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  // create var on local scope
+  std::string var_name("tmp_0");
+  auto var = scope.Var(var_name);
+  auto tensor = var->GetMutable<framework::LoDTensor>();
+  tensor->Resize({10, 10});
+
+  detail::RPCClient client;
+  client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, var_name, "");
+  server_thread.join();
+  rpc_service_.reset(nullptr);
+}
diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h
index ae6f9db3bd..879e21933b 100644
--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
@@ -76,6 +76,7 @@ namespace detail {
 enum class GrpcMethod {
   kSendVariable,
   kGetVariable,
+  kPrefetchVariable,
 };
 
 static const int kGrpcNumMethods =
@@ -87,6 +88,8 @@ inline const char* GrpcMethodName(GrpcMethod id) {
       return "/sendrecv.SendRecvService/SendVariable";
     case GrpcMethod::kGetVariable:
       return "/sendrecv.SendRecvService/GetVariable";
+    case GrpcMethod::kPrefetchVariable:
+      return "/sendrecv.SendREcvService/PrefetchVariable";
   }
 
   // Shouldn't be reached.
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
index 2d33f026e4..fc12e82a7e 100644
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -21,6 +21,8 @@ service SendRecvService {
   rpc SendVariable(VariableMessage) returns (VoidMessage) {}
   // Argument VariableMessage for GetVariable should only contain varname.
   rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  // Prefetch variable by Ids
+  rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
 }
 
 // VariableMessage is serialized paddle variable message.
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 832509641c..b87b8e6b26 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -17,90 +17,66 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class ReshapeOp : public framework::OperatorWithKernel {
- public:
-  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
-            const framework::VariableNameMap &outputs,
-            const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    // input check
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ReshapeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ReshapeOp should not be null.");
-
-    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
-    auto x_dims = ctx->GetInputDim("X");
-
-    std::vector<size_t> neg_dims_idx;
-    // set some dimension to -1 if it is unknown
-    const int unknown_size = -1;
-    for (size_t i = 0; i < shape.size(); ++i) {
-      PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size,
-                     "Each dimension of Attr(shape) must be positive or %d.",
-                     unknown_size);
-      if (shape[i] == unknown_size) {
-        neg_dims_idx.push_back(i);
-        PADDLE_ENFORCE(neg_dims_idx.size() <= 1,
-                       "Only one dimension of Attr(shape) can be unknown.");
-      }
-    }
-
-    int64_t capacity =
-        std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
-    int64_t in_size = framework::product(x_dims);
-    if (neg_dims_idx.size() == 1) {
-      // dim infer
-      shape[neg_dims_idx[0]] = in_size / (-capacity);
-      // recalculate capacity
-      capacity = shape[neg_dims_idx[0]] * (-capacity);
-    }
-    // capacity check
-    PADDLE_ENFORCE(capacity == in_size,
-                   "The size of Input(X) mismatches with Attr(shape).");
-    // resize output
-    std::vector<int64_t> shape_int64(shape.size(), 0);
-    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
-                   [](int a) { return static_cast<int64_t>(a); });
-    auto out_dims = framework::make_ddim(shape_int64);
-    ctx->SetOutputDim("Out", out_dims);
-    if (shape[0] == x_dims[0]) {
-      // Only pass LoD when the first dimension is equal between
-      // output and input.
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-};
-
 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input tensor of reshape operator.");
-    AddOutput("Out", "The output tensor of reshape operator.");
-    AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) "
-                              "Target shape of reshape operator.");
+    AddInput("X", "(Tensor). The input tensor of reshape operator.");
+    AddInput("Shape",
+             "(Tensor<int32>, optional). If provided, reshape according to "
+             "this given shape. That is to say it has a higher priority than "
+             "the shape attribute, while the shape attribute still should be "
+             "set correctly to gurantee shape inference in compile time.")
+        .AsDispensable();
+    AddOutput("Out", "(Tensor). The output tensor of reshape operator.");
+    AddAttr<std::vector<int>>(
+        "shape", "(std::vector<int>) Target shape of reshape operator.");
     AddAttr<bool>("inplace",
-                  "Change the source tensor's shape without copy memory.")
-        .SetDefault(true);
+                  "(default: false) Change the source tensor's shape without "
+                  "memory copy. When Attr(inplace) is set true, the output "
+                  "tensor shares memory with Input(X), otherwise, a new output "
+                  "tensor is created, and its data are copied from Input(x).")
+        .SetDefault(false);
     AddComment(R"DOC(
 Reshape Operator.
 
-Reshape Input(X) into the shape specified by Attr(shape).
+Reshape Input(X) into the shape specified by Attr(shape) or Input(Shape). The
+data in Input(X) are unchanged.
+
+Examples:
 
-An example:
-Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]]
+1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [6, 8], the reshape operator will transform Input(X)
+into a 2-D tensor with shape [6, 8] and leaving Input(X)'s data unchanged.
 
-and target shape = [1, 4], the reshape operator will transform
-the tensor X into a 2-D tensor: [[1, 2, 3, 4]]
+2. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will transform
+Input(X) into a 4-D tensor with shape [2, 3, 4, 2] and leaving Input(X)'s data
+unchanged. In this case, one and only dimension of Attr(shape) can be set to -1,
+the value of this dimension is inferred from the total element number of
+Input(X) and remaining dimensions.
+
+3. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will transform
+Input(X) into a 4-D tensor with shape [2, 4, 3, 2] and leaving Input(X)'s data
+unchanged. In this case, besides -1, 0 means the actual dimension value is going
+to be copied from the corresponding dimension of Input(X).
+
+Note:
+
+1. One and only one dimension in Attr(shape) can be set -1. In this case,
+the actual dimension value will be infered from the total element number of
+Input(X) and remaining dimensions.
+
+2. More than one dimensions in Attr(shape) can be set to 0, which means the real
+dimension value will be copied from Input(X) at runtime. Note that the index of
+0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape
+[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
+
+3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
+Attr(shape) still should be set correctly to gurantee shape inference in 
+compile-time.
 
-One dimension in the target shape can be set -1, representing that its
-size is unknown. In this case, the real dimension will be infered from 
-the original shape of Input(X) and other dimensions in the target shape.
 )DOC");
   }
 };
@@ -119,6 +95,14 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
                    "Input(Out@GRAD) shouldn't be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index eacb0a0cf2..871b4d38d5 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -20,17 +20,129 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+class ReshapeOp : public framework::OperatorWithKernel {
+ public:
+  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
+            const framework::VariableNameMap &outputs,
+            const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReshapeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReshapeOp should not be null.");
+
+    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    PADDLE_ENFORCE(!shape.empty(),
+                   "The shape information must be set by Attr(shape).");
+
+    if (ctx->HasInput("Shape") && ctx->IsRuntime()) {
+      // If true, set the shape of Output(Out) according to Input(Shape) in
+      // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel.
+      ctx->ShareLoD("X", /*->*/ "Out");
+      return;
+    }
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = ValidateShape(shape, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+
+  static framework::DDim ValidateShape(const std::vector<int> shape,
+                                       const framework::DDim &in_dims) {
+    const int64_t in_size = framework::product(in_dims);
+    // only one dimension canbe set to -1, whose size will be automatically
+    // infered.
+    const int64_t unk_dim_val = -1;
+    const int64_t copy_dim_val = 0;
+
+    std::vector<int64_t> output_shape(shape.size(), 0);
+    int64_t capacity = 1;
+    int unk_dim_idx = -1;
+    for (size_t i = 0; i < shape.size(); ++i) {
+      if (shape[i] == unk_dim_val) {
+        PADDLE_ENFORCE(
+            unk_dim_idx == -1,
+            "Only one input dimension of Attr(shape) can be unknown.");
+        unk_dim_idx = i;
+      } else if (shape[i] == copy_dim_val) {
+        PADDLE_ENFORCE(
+            static_cast<int>(i) < in_dims.size(),
+            "The index of dimension to copy from input shape must be less "
+            "than the size of input shape.");
+      } else {
+        PADDLE_ENFORCE(
+            shape[i] > 0,
+            "Each input dimension of Attr(shape) must not be negtive except "
+            "one unknown dimension.");
+      }
+
+      capacity *= (shape[i] ? shape[i] : in_dims[i]);
+      output_shape[i] =
+          (shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
+    }
+
+    if (unk_dim_idx != -1) {
+      output_shape[unk_dim_idx] = -in_size / capacity;
+      PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
+                        "Invalid shape is given.");
+    } else {
+      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
+    }
+    return framework::make_ddim(output_shape);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
 template <typename DeviceContext, typename T>
 class ReshapeKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto* in = ctx.Input<framework::Tensor>("X");
+  void Compute(const framework::ExecutionContext &ctx) const {
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    auto *in = ctx.Input<framework::LoDTensor>("X");
+    auto *shape_tensor = ctx.Input<framework::LoDTensor>("Shape");
+
+    framework::DDim out_dims = out->dims();
+    if (shape_tensor) {
+      auto *shape_data = shape_tensor->data<int>();
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        framework::Tensor cpu_shape_tensor;
+        TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(),
+                   &cpu_shape_tensor);
+        shape_data = cpu_shape_tensor.data<int>();
+      }
+      auto shape =
+          std::vector<int>(shape_data, shape_data + shape_tensor->numel());
+      out_dims = ReshapeOp::ValidateShape(shape, in->dims());
+    }
+    if (!in->lod().empty()) {
+      PADDLE_ENFORCE_EQ(
+          out_dims[0], in->dims()[0],
+          "Reshape operator cannot reshape an input sequence batch "
+          "into an output sequence batch that has a different "
+          "number of time steps. Please consider using "
+          "sequence_reshape op.");
+    }
+
     bool inplace = ctx.Attr<bool>("inplace");
-    auto out_dims = out->dims();
+    out->Resize(out_dims);
     if (!inplace) {
       out->mutable_data<T>(ctx.GetPlace());
       framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
+      // TensorCopy will resize to in_dims.
       out->Resize(out_dims);
     } else {
       out->ShareDataWith(*in);
@@ -42,9 +154,10 @@ class ReshapeKernel : public framework::OpKernel<T> {
 template <typename DeviceContext, typename T>
 class ReshapeGradKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+  void Compute(const framework::ExecutionContext &ctx) const {
+    auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
     d_x->mutable_data<T>(ctx.GetPlace());
     bool inplace = ctx.Attr<bool>("inplace");
 
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 2612fb1ae4..54d0a12bcd 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -48,8 +48,7 @@ def as_numpy(tensor):
     assert isinstance(tensor, core.LoDTensor)
     lod = tensor.lod()
     if len(lod) > 0:
-        raise RuntimeError(
-            "Some of your featched tensors hold LoD information. \
+        raise RuntimeError("Some of your fetched tensors hold LoD information. \
             They can not be completely cast to Python ndarray. \
             Please set the parameter 'return_numpy' as 'False' to \
             return LoDTensor itself directly.")
@@ -180,60 +179,24 @@ def get_program_cache_key(feed, fetch_list):
 
 
 class Executor(object):
-    def __init__(self, places):
-        if not isinstance(places, list) and not isinstance(places, tuple):
-            places = [places]
-
-        act_places = []
-        for each in places:
-            p = core.Place()
-            p.set_place(each)
-            act_places.append(p)
-
-        # TODO(dzhwinter) : only use the first place
-        self.executor = core.Executor(act_places[0])
-        self.places = places
+    def __init__(self, place):
+        self.place = place
+        p = core.Place()
+        p.set_place(place)
+        self.executor = core.Executor(p)
         self.program_caches = dict()
 
-    def aslodtensor(self, data):
-        def accumulate(data):
-            if not isinstance(data, list):
-                return 1
-            return sum([accumulate(sub) for sub in data])
-
-        def parselod(data):
-            seq_lens = [accumulate(seq) for seq in data]
-            cur_len = 0
-            lod = [cur_len]
-            for l in seq_lens:
-                cur_len += l
-                lod.append(cur_len)
-            return lod
-
-        assert len(self.places) != 0
-        if not isinstance(data, list):
-            # pure tensor case
-            tensor = core.LoDTensor()
-            tensor.set(data, self.places[0])
-            return tensor
-        else:
-            raise RuntimeError("Current implementation lacks unittests")
-            # lodtensor case
-            lod = []
-            if not isinstance(data[0], list):
-                lod.append(parselod(data))
-                flattened_data = np.concatenate(data, axis=0).astype("int64")
-            else:
-                while isinstance(data[0], list):
-                    lod.append(parselod(seq))
-                    flattened_data = [item for seq in data for item in seq]
-                    data = flattened_data
-                flattened_data = np.concatenate(data, axis=0).astype("int64")
-            flattened_data = flattened_data.reshape([len(flattened_data), 1])
-            tensor = core.LoDTensor()
-            tensor.set(flattened_data, self.places[0])
-            tensor.set_lod(lod)
-            return tensor
+    def as_lodtensor(self, data):
+        if isinstance(data, list):
+            raise RuntimeError("Some of your feed data hold LoD information. \
+                They can not be completely cast from a list of Python \
+                ndarray to LoDTensor. Please convert data to LoDTensor \
+                directly before feeding the data.\
+                ")
+        # single tensor case
+        tensor = core.LoDTensor()
+        tensor.set(data, self.place)
+        return tensor
 
     def _get_program_cache(self, program_cache_key):
         return self.program_caches.get(program_cache_key, None)
@@ -293,7 +256,7 @@ class Executor(object):
                 feed_target_name = op.desc.output('Out')[0]
                 cur_feed = feed[feed_target_name]
                 if not isinstance(cur_feed, core.LoDTensor):
-                    cur_feed = self.aslodtensor(cur_feed)
+                    cur_feed = self.as_lodtensor(cur_feed)
                 idx = op.desc.attr('col')
                 core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
             else:
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3e649dc5fd..a5938fe494 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -19,7 +19,6 @@ from layer_function_generator import generate_layer_fn
 from layer_function_generator import autodoc
 from ..layer_helper import LayerHelper
 import tensor
-import ops
 import nn
 import math
 
@@ -58,7 +57,7 @@ def detection_output(loc,
 
     This operation is to get the detection results by performing following
     two steps:
-    
+
     1. Decode input bounding box predictions according to the prior boxes.
     2. Get the final detection results by applying multi-class non maximum
        suppression (NMS).
@@ -130,9 +129,9 @@ def detection_output(loc,
         target_box=loc,
         code_type='decode_center_size')
     old_shape = scores.shape
-    scores = ops.reshape(x=scores, shape=(-1, old_shape[-1]))
+    scores = nn.reshape(x=scores, shape=(-1, old_shape[-1]))
     scores = nn.softmax(input=scores)
-    scores = ops.reshape(x=scores, shape=old_shape)
+    scores = nn.reshape(x=scores, shape=old_shape)
     scores = nn.transpose(scores, perm=[0, 2, 1])
     scores.stop_gradient = True
     nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
@@ -463,7 +462,7 @@ def ssd_loss(location,
     num, num_prior, num_class = confidence.shape
 
     def __reshape_to_2d(var):
-        return ops.reshape(x=var, shape=[-1, var.shape[-1]])
+        return nn.reshape(x=var, shape=[-1, var.shape[-1]])
 
     # 1. Find matched boundding box by prior box.
     #   1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
@@ -474,7 +473,7 @@ def ssd_loss(location,
 
     # 2. Compute confidence for mining hard examples
     # 2.1. Get the target label based on matched indices
-    gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, ))
+    gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, ))
     gt_label.stop_gradient = True
     target_label, _ = target_assign(
         gt_label, matched_indices, mismatch_value=background_label)
@@ -487,7 +486,7 @@ def ssd_loss(location,
     conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
 
     # 3. Mining hard examples
-    conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior))
+    conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior))
     conf_loss.stop_gradient = True
     neg_indices = helper.create_tmp_variable(dtype='int32')
     dtype = matched_indices.dtype
@@ -556,7 +555,7 @@ def ssd_loss(location,
     # 5.3 Compute overall weighted loss.
     loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
     # reshape to [N, Np], N is the batch size and Np is the prior box number.
-    loss = ops.reshape(x=loss, shape=[-1, num_prior])
+    loss = nn.reshape(x=loss, shape=[-1, num_prior])
     loss = nn.reduce_sum(loss, dim=1, keep_dim=True)
     if normalize:
         normalizer = nn.reduce_sum(target_loc_weight)
@@ -709,7 +708,7 @@ def multi_box_head(inputs,
         new_shape = [
             -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)])
         ]
-        out = ops.reshape(x=input, shape=new_shape)
+        out = nn.reshape(x=input, shape=new_shape)
         return out
 
     def _is_list_or_tuple_(data):
@@ -803,7 +802,7 @@ def multi_box_head(inputs,
             mbox_loc.shape[0],
             mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4
         ]
-        mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape)
+        mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape)
         mbox_locs.append(mbox_loc_flatten)
 
         # get conf
@@ -819,7 +818,7 @@ def multi_box_head(inputs,
             conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
             conf_loc.shape[3] / num_classes, num_classes
         ]
-        conf_loc_flatten = ops.reshape(conf_loc, shape=new_shape)
+        conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape)
         mbox_confs.append(conf_loc_flatten)
 
     if len(box_results) == 1:
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0332556f62..e59ee25120 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -73,6 +73,7 @@ __all__ = [
     'smooth_l1',
     'one_hot',
     'autoincreased_step_counter',
+    'reshape',
     'lod_reset',
     'lrn',
 ]
@@ -3265,6 +3266,8 @@ def one_hot(input, depth):
          The one-hot tensor or LodTensor, same as input.
 
     Examples:
+        .. code-block:: python
+
         X is a LoDTensor:
           X.lod = [[0, 1, 4]]
           X.shape = [4, 1]
@@ -3319,6 +3322,101 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
     return counter
 
 
+def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
+    """
+    Gives a new shape to the input Tensor without changing its data.
+
+    The target shape can be given by :attr:`shape` or :attr:`actual_shape`.
+    :attr:`shape` is a list of integer while :attr:`actual_shape` is a tensor
+    variable. :attr:`actual_shape` has a higher priority than :attr:`shape`
+    if it is provided, while :attr:`shape` still should be set correctly to
+    gurantee shape inference in compile-time.
+
+    Some tricks exist when specifying the target shape.
+
+    1. -1 means the value of this dimension is inferred from the total element
+    number of x and remaining dimensions. Thus one and only one dimension can
+    be set -1.
+
+    2. 0 means the actual dimension value is going to be copied from the
+    corresponding dimension of x. The indice of 0s in shape can not exceed
+    Rank(X).
+
+    Here are some examples to explain it.
+
+    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    is [6, 8], the reshape operator will transform x into a 2-D tensor with 
+    shape [6, 8] and leaving x's data unchanged.
+
+    2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    specified is [2, 3, -1, 2], the reshape operator will transform x into a
+    4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
+    case, one dimension of the target shape is set to -1, the value of this 
+    dimension is inferred from the total element number of x and remaining 
+    dimensions.
+
+    3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor
+    with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case,
+    besides -1, 0 means the actual dimension value is going to be copied from
+    the corresponding dimension of x.
+
+    Args:
+        input(variable): The input tensor.
+        shape(list): The new shape. At most one dimension of the new shape can
+                     be -1.
+        actual_shape(variable): An optional input. If provided, reshape
+                                according to this given shape rather than
+                                :attr:`shape` specifying shape. That is to
+                                say :attr:`actual_shape` has a higher priority
+                                than :attr:`shape`.
+        act (str): The non-linear activation to be applied to output variable.
+        inplace(bool): If this flag is set true, a new output tensor is created
+                       whose data is copied from input x, otherwise the output
+                       shares data with input without copying.
+
+    Returns(variable): The output tensor.
+
+    Examples:
+        .. code-block:: python
+            data = fluid.layers.data(
+                name='data', shape=[2, 4, 6], dtype='float32')
+            reshaped = fluid.layers.reshape(
+                x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True)
+    """
+
+    if not (isinstance(shape, list) or isinstance(shape, tuple)):
+        raise ValueError("Input shape must be a python lsit or tuple.")
+
+    # Validate the shape
+    unk_dim_idx = -1
+    for dim_idx, dim_size in enumerate(shape):
+        if dim_size == -1:
+            assert unk_dim_idx == -1, (
+                "Only one dimension in shape can be unknown.")
+            unk_dim_idx = dim_idx
+        elif dim_size == 0:
+            assert dim_idx < len(x.shape), (
+                "The indice of 0s in shape can not exceed Rank(X).")
+        else:
+            assert dim_size > 0, (
+                "Each dimension size given in shape must not be negtive "
+                "except one unknown dimension.")
+
+    helper = LayerHelper("reshape", **locals())
+    reshaped = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="reshape",
+        inputs={"X": x,
+                "Shape": actual_shape}
+        if isinstance(actual_shape, Variable) else {"X": x},
+        attrs={"shape": shape,
+               "inplace": inplace},
+        outputs={"Out": reshaped})
+
+    return helper.append_activation(reshaped)
+
+
 def lod_reset(x, y=None, target_lod=None):
     """
     LoD Reset Operator. Set LoD of **x** to a new one specified by **y** or
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 0e5987ee59..a9fe25744c 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -49,7 +49,6 @@ __activations__ = [
 __all__ = [
     'mean',
     'mul',
-    'reshape',
     'scale',
     'sigmoid_cross_entropy_with_logits',
     'elementwise_add',
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 8393f7827b..299ab8e51f 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -334,7 +334,7 @@ class OpTest(unittest.TestCase):
                     np.allclose(
                         actual_t, expect_t, atol=atol),
                     "Output (" + out_name + ") has diff at " + str(place) +
-                    str(actual_t) + str(expect_t))
+                    str(actual_t) + "\n" + str(expect_t))
                 if isinstance(expect, tuple):
                     self.assertListEqual(actual.lod(), expect[1],
                                          "Output (" + out_name +
@@ -568,6 +568,6 @@ class OpTest(unittest.TestCase):
 
         fetch_list = [g for p, g in param_grad_list]
         executor = Executor(place)
-        return map(
-            np.array,
-            executor.run(prog, feed_dict, fetch_list, return_numpy=False))
+        return map(np.array,
+                   executor.run(prog, feed_dict, fetch_list,
+                                return_numpy=False))
diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
old mode 100755
new mode 100644
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 11f35c74d4..f51b5a7e99 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -14,15 +14,19 @@
 
 import unittest
 import numpy as np
+
 from op_test import OpTest
 
 
 class TestReshapeOp(OpTest):
     def setUp(self):
+        ori_shape = (2, 25)
+        new_shape = (5, 10)
+
         self.op_type = "reshape"
-        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-        self.attrs = {'shape': [10 * 20]}
-        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
         self.check_output()
@@ -31,12 +35,33 @@ class TestReshapeOp(OpTest):
         self.check_grad(["X"], "Out")
 
 
-class TestReshapeOpDimInfer(OpTest):
+class TestReshapeOpDimInfer1(OpTest):
     def setUp(self):
+        ori_shape = (5, 10)
+        new_shape = (5, -1, 5)
+
         self.op_type = "reshape"
-        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-        self.attrs = {'shape': [4, -1, 5]}
-        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpDimInfer2(OpTest):
+    def setUp(self):
+        ori_shape = (2, 2, 6)
+        new_shape = (2, 0, 3, -1)
+        infered_shape = (2, 2, 3, -1)
+
+        self.op_type = "reshape"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
 
     def test_check_output(self):
         self.check_output()
@@ -47,10 +72,30 @@ class TestReshapeOpDimInfer(OpTest):
 
 class TestReshapeOpInplace(OpTest):
     def setUp(self):
+        ori_shape = (2, 25)
+        new_shape = (5, 10)
+
+        self.op_type = "reshape"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpDimInferInplace1(OpTest):
+    def setUp(self):
+        ori_shape = (5, 10)
+        new_shape = (5, -1, 5)
+
         self.op_type = "reshape"
-        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-        self.attrs = {'shape': [10 * 20], 'inplace': True}
-        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
         self.check_output()
@@ -59,12 +104,38 @@ class TestReshapeOpInplace(OpTest):
         self.check_grad(["X"], "Out")
 
 
-class TestReshapeOpDimInferInplace(OpTest):
+class TestReshapeOpDimInferInplace2(OpTest):
     def setUp(self):
+        ori_shape = (2, 2, 6)
+        new_shape = (2, 0, 3, -1)
+        infered_shape = (2, 2, 3, -1)
+
+        self.op_type = "reshape"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpWithInputShape(OpTest):
+    def setUp(self):
+        ori_shape = (6, 5)
+        new_shape = (0, -1, 5)
+        actual_shape = (2, 3, 5)
+
         self.op_type = "reshape"
-        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-        self.attrs = {'shape': [4, -1, 5], 'inplace': True}
-        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+        self.inputs = {
+            "X": np.random.random(ori_shape).astype("float32"),
+            "Shape": np.array(
+                actual_shape, dtype="int32")
+        }
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)}
 
     def test_check_output(self):
         self.check_output()
@@ -73,5 +144,5 @@ class TestReshapeOpDimInferInplace(OpTest):
         self.check_grad(["X"], "Out")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
old mode 100755
new mode 100644

From 997e9a1fd2a98120a269b7569fccd7f1e595059b Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Apr 2018 13:53:21 +0800
Subject: [PATCH 29/57] fix mac compile

---
 paddle/fluid/framework/details/var_handle.h | 2 +-
 paddle/fluid/framework/parallel_executor.cc | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 893cc15f6c..569dda17c6 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -22,7 +22,7 @@
 namespace paddle {
 namespace framework {
 namespace details {
-struct OpHandleBase;
+class OpHandleBase;
 
 // VarHandleBase is the var node in the dependency graph.
 // A variable can only be generated by a single operator. i.e.
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 91f2db9354..292e4732b4 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/parallel_executor.h"
+
 #include <string>
+#include <vector>
 
-#include "ThreadPool.h"
+#include "paddle/fluid/framework/threadpool.h"
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"

From 4c8eef5739ecb64295992a5361a5f52f896895d4 Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Mon, 2 Apr 2018 14:39:33 +0800
Subject: [PATCH 30/57] Add python wrapper for pad_op

---
 python/paddle/fluid/layers/nn.py | 58 ++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0332556f62..c6f831c29d 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -75,6 +75,7 @@ __all__ = [
     'autoincreased_step_counter',
     'lod_reset',
     'lrn',
+    'pad',
 ]
 
 
@@ -3482,3 +3483,60 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
                "beta": beta})
 
     return lrn_out
+
+
+def pad(x, paddings, pad_value=0., name=None):
+    """
+    Pads a tensor with a constant value given by :attr:pad_value, and the
+    padded width is specified by :attr:paddings. 
+
+    Specifically, the number of values padded before each dimension 
+    :attr:i is indicated by :attr:paddings[i], and the number of values padded
+    after each dimension :attr:i is indicated by :attr:paddings[i+1].
+
+    See below for an example.
+
+    .. code-block:: text
+
+        Given:
+            x = [[1, 2], [3, 4]]
+
+            paddings = [0, 1, 1, 2]
+
+            pad_value = 0
+
+        Return:
+
+            out = [[0, 1, 2, 0, 0]
+                   [0, 3, 4, 0, 0]
+                   [0, 0, 0, 0, 0]]
+
+    Args:
+        x (Variable): The input tensor variable.
+        paddings (list): A list of integers. Its elements specify the padded
+                         width before and after for each dimension in turn.
+                         The length of :attr:paddings must be 
+                         :math:`rank(x) \\times 2`.
+        pad_value (float): The constant value used to pad.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The padded tensor variable.
+
+    Examples:
+        .. code-block:: python
+            # x is a rank 2 tensor variable.
+            out = fluid.layers.pad(
+                x=x, paddings=[0, 1, 1, 2], pad_value=0.)
+    """
+    helper = LayerHelper('pad', input=x, **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='pad',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'paddings': paddings,
+               'pad_value': float(pad_value)})
+    return out

From 9a101cfc08b90832cfa44b9cad1e25db640b7948 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Apr 2018 15:05:14 +0800
Subject: [PATCH 31/57] clean code

---
 paddle/fluid/framework/parallel_executor.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 292e4732b4..577eea92d2 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -17,8 +17,6 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/threadpool.h"
-
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

From 3b3d210c3e4a294d8a545521e6ea4e3ff1f5125c Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Apr 2018 16:18:00 +0800
Subject: [PATCH 32/57] lookuptable support SelectedRows as table parameter

---
 paddle/fluid/framework/selected_rows.h    |   5 +-
 paddle/fluid/operators/lookup_table_op.cc |   2 +-
 paddle/fluid/operators/lookup_table_op.h  | 115 ++++++++++++++--------
 3 files changed, 80 insertions(+), 42 deletions(-)

diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index c9c2c1bb72..9458d56a01 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -10,6 +10,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
 
@@ -52,7 +55,7 @@ class SelectedRows {
 
  private:
   // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here.
-  // SelectedRows are simplely concated when adding together. Until a
+  // SelectedRows are simply concated when adding together. Until a
   // SelectedRows add a Tensor, will the duplicate rows be handled.
   Vector<int64_t> rows_;
   std::unique_ptr<Tensor> value_{nullptr};
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 50eeadab72..92c7d7f9ca 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -84,7 +84,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                      "If the value is -1, it makes no effect to lookup. "
                      "Otherwise the given value indicates padding the output "
                      "with zeros whenever lookup encounters it in Ids.")
-        .SetDefault(-1);
+        .SetDefault(kNoPadding);
     AddComment(R"DOC(
 Lookup Table Operator.
 
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index c92ce78eef..02ffbd1361 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -25,16 +28,37 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+
+static const int64_t kNoPadding = -1;
+
+inline size_t getIndex(const std::vector<int64_t> &rows, int64_t value) {
+  auto it = std::find(rows.begin(), rows.end(), value);
+  PADDLE_ENFORCE(it != rows.end(), "id should be in rows");
+  return std::distance(rows.begin(), it);
+}
 
 template <typename T>
 class LookupTableKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* table_t = context.Input<LoDTensor>("W");
-    auto* ids_var = context.InputVar("Ids");
-    Tensor* output_t = context.Output<Tensor>("Out");
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *table_var = context.InputVar("W");
+    auto *ids_var = context.InputVar("Ids");
+    Tensor *output_t = context.Output<Tensor>("Out");
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+
+    DDim table_dim;
+
+    if (table_var->IsType<LoDTensor>()) {
+      table_dim = context.Input<LoDTensor>("W")->dims();
+    } else if (table_var->IsType<SelectedRows>()) {
+      auto *table_t = context.Input<SelectedRows>("W");
+      table_dim = table_t->value().dims();
+    } else {
+      PADDLE_THROW("table only support LoDTensor and SelectedRows");
+    }
 
-    int64_t* ids;
+    int64_t *ids;
     int64_t ids_numel;
 
     // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
@@ -42,39 +66,50 @@ class LookupTableKernel : public framework::OpKernel<T> {
     // when Ids's type is SelectedRows, the rows of Ids contains the
     // ids to be looked up in W.
     if (ids_var->IsType<LoDTensor>()) {
-      auto* ids_t = context.Input<LoDTensor>("Ids");
-      ids = const_cast<int64_t*>(ids_t->data<int64_t>());
+      auto *ids_t = context.Input<LoDTensor>("Ids");
+      ids = const_cast<int64_t *>(ids_t->data<int64_t>());
       ids_numel = ids_t->numel();
     } else if (ids_var->IsType<SelectedRows>()) {
-      auto* ids_t = context.Input<SelectedRows>("Ids");
-      ids = const_cast<int64_t*>(ids_t->rows().data());
+      auto *ids_t = context.Input<SelectedRows>("Ids");
+      ids = const_cast<int64_t *>(ids_t->rows().data());
       ids_numel = ids_t->rows().size();
-      output_t->Resize({ids_numel, table_t->dims()[1]});
+      output_t->Resize({ids_numel, table_dim[1]});
     } else {
       PADDLE_THROW("Unsupported Variable Type of Ids");
     }
 
-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+    if (table_var->IsType<LoDTensor>()) {
+      auto *table_t = context.Input<LoDTensor>("W");
+      int64_t row_number = table_t->dims()[0];
+      int64_t row_width = table_t->dims()[1];
 
-    int N = table_t->dims()[0];
-    int D = table_t->dims()[1];
-    auto* table = table_t->data<T>();
-    auto* output = output_t->mutable_data<T>(context.GetPlace());
+      auto *table = table_t->data<T>();
+      auto *output = output_t->mutable_data<T>(context.GetPlace());
 
-    if (padding_idx == -1) {
       for (int64_t i = 0; i < ids_numel; ++i) {
-        PADDLE_ENFORCE_LT(ids[i], N);
-        PADDLE_ENFORCE_GE(ids[i], 0);
-        memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_LT(ids[i], row_number);
+          PADDLE_ENFORCE_GE(ids[i], 0);
+          memcpy(output + i * row_width, table + ids[i] * row_width,
+                 row_width * sizeof(T));
+        }
       }
-    } else {
+    } else if (table_var->IsType<SelectedRows>()) {
+      const auto &table_t = table_var->Get<SelectedRows>();
+      int64_t row_width = table_t.value().dims()[1];
+      const auto *table = table_t.value().data<T>();
+      auto *output = output_t->mutable_data<T>(context.GetPlace());
+
       for (int64_t i = 0; i < ids_numel; ++i) {
-        if (ids[i] == padding_idx) {
-          memset(output + i * D, 0, D * sizeof(T));
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
         } else {
-          PADDLE_ENFORCE_LT(ids[i], N);
           PADDLE_ENFORCE_GE(ids[i], 0);
-          memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+          auto id_index = getIndex(table_t.rows(), ids[i]);
+          memcpy(output + i * row_width, table + id_index * row_width,
+                 row_width * sizeof(T));
         }
       }
     }
@@ -84,17 +119,17 @@ class LookupTableKernel : public framework::OpKernel<T> {
 template <typename T>
 class LookupTableGradKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext &context) const override {
     bool is_sparse = context.Attr<bool>("is_sparse");
     // Since paddings are not trainable and fixed in forward, the gradient of
     // paddings makes no sense and we don't deal with it in backward.
     if (is_sparse) {
-      auto* ids = context.Input<LoDTensor>("Ids");
-      auto* table = context.Input<LoDTensor>("W");
-      auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto* d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *table = context.Input<LoDTensor>("W");
+      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
 
-      auto* ids_data = ids->data<int64_t>();
+      auto *ids_data = ids->data<int64_t>();
       auto ids_dim = ids->dims();
 
       framework::Vector<int64_t> new_rows;
@@ -104,31 +139,31 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       }
       d_table->set_rows(new_rows);
 
-      auto* d_table_value = d_table->mutable_value();
+      auto *d_table_value = d_table->mutable_value();
       d_table_value->Resize({ids_dim[0], table->dims()[1]});
       d_table_value->mutable_data<T>(context.GetPlace());
 
       d_table->set_height(table->dims()[0]);
 
-      auto* d_output_data = d_output->data<T>();
-      auto* d_table_data = d_table_value->data<T>();
+      auto *d_output_data = d_output->data<T>();
+      auto *d_table_data = d_table_value->data<T>();
 
       PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
       memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
     } else {
-      auto* ids = context.Input<LoDTensor>("Ids");
-      auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto* d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
-      auto* table = context.Input<LoDTensor>("W");
+      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
+      auto *table = context.Input<LoDTensor>("W");
 
-      auto* ids_data = ids->data<int64_t>();
+      auto *ids_data = ids->data<int64_t>();
       auto ids_dim = ids->dims();
 
       int N = table->dims()[0];
       int D = d_output->dims()[1];
 
-      auto* d_output_data = d_output->data<T>();
-      auto* d_table_data = d_table->mutable_data<T>(context.GetPlace());
+      auto *d_output_data = d_output->data<T>();
+      auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
 
       memset(d_table_data, 0, d_table->numel() * sizeof(T));
 

From 6fff0d4d4c05c57b5e5d417bcec7b2629c96b7e2 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Apr 2018 16:23:05 +0800
Subject: [PATCH 33/57] update LookupTableGradKernel

---
 paddle/fluid/operators/lookup_table_op.h | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 02ffbd1361..8760cc2ee9 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -120,12 +120,22 @@ template <typename T>
 class LookupTableGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
+    auto *table_var = context.InputVar("W");
+    DDim table_dim;
+    if (table_var->IsType<LoDTensor>()) {
+      table_dim = context.Input<LoDTensor>("W")->dims();
+    } else if (table_var->IsType<SelectedRows>()) {
+      auto *table_t = context.Input<SelectedRows>("W");
+      table_dim = table_t->value().dims();
+    } else {
+      PADDLE_THROW("table only support LoDTensor and SelectedRows");
+    }
+
     bool is_sparse = context.Attr<bool>("is_sparse");
     // Since paddings are not trainable and fixed in forward, the gradient of
     // paddings makes no sense and we don't deal with it in backward.
     if (is_sparse) {
       auto *ids = context.Input<LoDTensor>("Ids");
-      auto *table = context.Input<LoDTensor>("W");
       auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
       auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
 
@@ -140,10 +150,10 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       d_table->set_rows(new_rows);
 
       auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_dim[0], table->dims()[1]});
+      d_table_value->Resize({ids_dim[0], table_dim[1]});
       d_table_value->mutable_data<T>(context.GetPlace());
 
-      d_table->set_height(table->dims()[0]);
+      d_table->set_height(table_dim[0]);
 
       auto *d_output_data = d_output->data<T>();
       auto *d_table_data = d_table_value->data<T>();
@@ -154,12 +164,11 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       auto *ids = context.Input<LoDTensor>("Ids");
       auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
       auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
-      auto *table = context.Input<LoDTensor>("W");
 
       auto *ids_data = ids->data<int64_t>();
       auto ids_dim = ids->dims();
 
-      int N = table->dims()[0];
+      int N = table_dim[0];
       int D = d_output->dims()[1];
 
       auto *d_output_data = d_output->data<T>();

From a94e25740e1b6622c65178bd3ce0b40f4aeb28ce Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Apr 2018 16:24:38 +0800
Subject: [PATCH 34/57] optimize code

---
 paddle/fluid/operators/lookup_table_op.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 8760cc2ee9..fff5edda62 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -30,12 +30,12 @@ using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
 using DDim = framework::DDim;
 
-static const int64_t kNoPadding = -1;
+static constexpr int64_t kNoPadding = -1;
 
 inline size_t getIndex(const std::vector<int64_t> &rows, int64_t value) {
   auto it = std::find(rows.begin(), rows.end(), value);
   PADDLE_ENFORCE(it != rows.end(), "id should be in rows");
-  return std::distance(rows.begin(), it);
+  return static_cast<size_t>(std::distance(rows.begin(), it));
 }
 
 template <typename T>

From b94f24d44f314279cfe7230db37a22e225957e15 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Mon, 2 Apr 2018 17:33:14 +0800
Subject: [PATCH 35/57] Move StartPrefetcher and EndPrefetcher to private

---
 .../operators/reader/create_double_buffer_reader_op.cc      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 342cd2a549..f9a8058f2a 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -66,6 +66,9 @@ class DoubleBufferReader : public framework::DecoratedReader {
   void ReadNext(std::vector<framework::LoDTensor>* out) override;
   void ReInit() override;
 
+  ~DoubleBufferReader() { EndPrefetcher(); }
+
+ private:
   void StartPrefetcher() {
     channel_ = framework::MakeChannel<Item>(kChannelSize);
     prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
@@ -80,9 +83,6 @@ class DoubleBufferReader : public framework::DecoratedReader {
     channel_ = nullptr;
   }
 
-  ~DoubleBufferReader() { EndPrefetcher(); }
-
- private:
   void PrefetchThreadFunc();
 
   std::thread prefetcher_;

From 7a6ffb62805e3c590b4da1f7047380a64cabcf48 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Apr 2018 18:38:41 +0800
Subject: [PATCH 36/57] add TestLookupTableWIsSelectedRows

---
 paddle/fluid/operators/lookup_table_op.cc     | 24 ++++++++---
 .../tests/unittests/test_lookup_table_op.py   | 42 +++++++++++++++++++
 2 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 92c7d7f9ca..deabcdc99f 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -18,6 +18,22 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+static inline framework::OpKernelType ExpectedKernelType(
+    const framework::ExecutionContext& ctx) {
+  auto* table_var = ctx.InputVar("W");
+  if (table_var->IsType<LoDTensor>()) {
+    return framework::OpKernelType(
+        framework::ToDataType(table_var->Get<LoDTensor>().type()),
+        ctx.device_context());
+  } else if (table_var->IsType<SelectedRows>()) {
+    return framework::OpKernelType(
+        framework::ToDataType(table_var->Get<SelectedRows>().value().type()),
+        ctx.device_context());
+  } else {
+    PADDLE_THROW("W should be LoDTensor or SelectedRows");
+  }
+}
+
 class LookupTableOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -51,9 +67,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
-        ctx.device_context());
+    return ExpectedKernelType(ctx);
   }
 };
 
@@ -124,9 +138,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
-        ctx.device_context());
+    return ExpectedKernelType(ctx);
   }
 };
 
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
index ed920ad388..3f739afd25 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
@@ -96,5 +96,47 @@ class TestLookupTableIdsIsSelectedRows(OpTest):
             self.check_with_place(place)
 
 
+class TestLookupTableWIsSelectedRows(OpTest):
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        # create and initialize Id Variable
+        ids_tensor = scope.var('Ids').get_tensor()
+        ids_array = np.array([[0], [4], [3], [5]]).astype("int64")
+        ids_tensor.set(ids_array, place)
+
+        # create and initialize W Variable
+        rows = [0, 1, 2, 3, 4, 5, 6]
+        row_numel = 12
+
+        w_selected_rows = scope.var('W').get_selected_rows()
+        w_selected_rows.set_height(len(rows))
+        w_selected_rows.set_rows(rows)
+        w_array = np.ones((len(rows), row_numel)).astype("float32")
+        for i in range(len(rows)):
+            w_array[i] *= i
+        ids_tensor = w_selected_rows.get_tensor()
+        ids_tensor.set(w_array, place)
+
+        # create Out Variable
+        Out_tensor = scope.var('Out').get_tensor()
+
+        # create and run lookup_table operator
+        lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
+        lookup_table.run(scope, place)
+
+        # get result from Out
+        result_array = np.array(Out_tensor)
+        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
+        for idx, row in enumerate(ids_array):
+            assert (row[0] == result_array[idx]).all()
+
+    def test_w_is_selected_rows(self):
+        places = [core.CPUPlace()]
+        # currently only support CPU
+        for place in places:
+            self.check_with_place(place)
+
+
 if __name__ == "__main__":
     unittest.main()

From 30adc0b5f867aabc61367d293bd1cabb0216a51a Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Mon, 2 Apr 2018 19:06:57 +0800
Subject: [PATCH 37/57] add notation

---
 .../framework/details/multi_devices_graph_builder.cc | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 1aa33768c8..c277bd7cb6 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -55,7 +55,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     const ProgramDesc &program) const {
   auto graph = new SSAGraph();
   SSAGraph &result = *graph;
-  std::unordered_set<std::string> og_has_bc;
+  std::unordered_set<std::string> og_has_been_broadcast;
   result.vars_.resize(places_.size());
 
   bool is_forwarding = true;
@@ -123,11 +123,15 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
 
     if (!is_forwarding) {
       auto var_names = op->OutputArgumentNames();
+      // Currently, we assume that once gradient is generated, it can be
+      // broadcast, and each gradient is only broadcast once. But there are no
+      // other cases, for example, we need to adjust the gradient according to
+      // the input when we get the gradient, which is not considered at present.
       for (auto &og : var_names) {
         if (grad_names_.count(og) != 0 &&
-            og_has_bc.count(og) == 0) {  // is param grad
-                                         // Insert NCCL AllReduce Op
-          og_has_bc.insert(og);
+            og_has_been_broadcast.count(og) == 0) {  // is param grad
+                                                     // Insert NCCL AllReduce Op
+          og_has_been_broadcast.insert(og);
 #ifdef PADDLE_WITH_CUDA
           result.ops_.emplace_back(
               new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));

From 2be10ebe8ae03d2a3105fa3108a74116c41a8f66 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Apr 2018 23:01:33 +0800
Subject: [PATCH 38/57] disable test_recv_op

---
 python/paddle/fluid/tests/unittests/test_recv_op.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_recv_op.py
index 854238c627..2ebceca7e4 100644
--- a/python/paddle/fluid/tests/unittests/test_recv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recv_op.py
@@ -23,7 +23,7 @@ import time
 
 
 class TestRecvOp(unittest.TestCase):
-    def test_send(self):
+    def no_test_send(self):
         # Run init_serv in a thread
         place = fluid.CPUPlace()
         p = Process(target=self.init_serv, args=(place, ))

From f02968bb217dc5274bbab4458738cc756f904448 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 2 Apr 2018 23:06:30 +0800
Subject: [PATCH 39/57] disable test_recv_op

---
 python/paddle/fluid/tests/unittests/test_recv_op.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_recv_op.py
index 854238c627..2ebceca7e4 100644
--- a/python/paddle/fluid/tests/unittests/test_recv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recv_op.py
@@ -23,7 +23,7 @@ import time
 
 
 class TestRecvOp(unittest.TestCase):
-    def test_send(self):
+    def no_test_send(self):
         # Run init_serv in a thread
         place = fluid.CPUPlace()
         p = Process(target=self.init_serv, args=(place, ))

From 0b8534f2a456d786fdc7ef2f252409f0ac0bbca3 Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Mon, 2 Apr 2018 23:35:54 +0800
Subject: [PATCH 40/57] Refine python wrapper for pad_op

---
 doc/fluid/api/layers.rst         |  6 ++++++
 python/paddle/fluid/layers/nn.py | 13 ++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index ae35d8c534..22e6fb13d7 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -494,6 +494,12 @@ reshape
 ..  autofunction:: paddle.fluid.layers.reshape
     :noindex:
 
+pad
+---
+
+..  autofunction:: paddle.fluid.layers.pad
+    :noindex:
+
 scale
 -----
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index f96bc6911f..3d13133bf2 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3380,6 +3380,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
 
     Examples:
         .. code-block:: python
+
             data = fluid.layers.data(
                 name='data', shape=[2, 4, 6], dtype='float32')
             reshaped = fluid.layers.reshape(
@@ -3585,12 +3586,13 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
 
 def pad(x, paddings, pad_value=0., name=None):
     """
-    Pads a tensor with a constant value given by :attr:pad_value, and the
-    padded width is specified by :attr:paddings. 
+    Pads a tensor with a constant value given by :attr:`pad_value`, and the
+    padded width is specified by :attr:`paddings`. 
 
-    Specifically, the number of values padded before each dimension 
-    :attr:i is indicated by :attr:paddings[i], and the number of values padded
-    after each dimension :attr:i is indicated by :attr:paddings[i+1].
+    Specifically, the number of values padded before the contents of :attr:`x`
+    in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
+    of values padded after the contents of :attr:`x` in dimension :attr:`i` is
+    indicated by :attr:`paddings[i+1]`.
 
     See below for an example.
 
@@ -3624,6 +3626,7 @@ def pad(x, paddings, pad_value=0., name=None):
 
     Examples:
         .. code-block:: python
+
             # x is a rank 2 tensor variable.
             out = fluid.layers.pad(
                 x=x, paddings=[0, 1, 1, 2], pad_value=0.)

From 9365d110b5ee789e95568f72ae7d627960e45c36 Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Fri, 30 Mar 2018 17:27:19 -0700
Subject: [PATCH 41/57] temporaryly disable ncclBcastOp test, it fails randomly

---
 paddle/fluid/operators/nccl_op_test.cu.cc | 91 ++++++++++++-----------
 1 file changed, 46 insertions(+), 45 deletions(-)

diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc
index 90f6f955ce..7659bb9edd 100644
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
@@ -236,48 +236,49 @@ TEST_F(NCCLTester, ncclReduceOp) {
 }
 
 // ncclBcastOp with desc
-TEST_F(NCCLTester, ncclBcastOp) {
-  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
-  const int kRoot = 0;
-  op2->SetType("ncclBcast");
-  op2->SetInput("X", {"st"});
-  op2->SetInput("Communicator", {"comm"});
-  op2->SetOutput("Out", {"rt"});
-  op2->SetAttr("root", kRoot);
-
-  std::vector<f::Scope *> dev_scopes;
-
-  std::vector<std::thread> ths;
-
-  for (size_t i = 0; i < gpu_list_.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope_.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
-                   *op2.get(), dev_scopes[i]);
-    ths.emplace_back(std::move(th));
-  }
-
-  for (size_t i = 0; i < gpu_list_.size(); ++i) {
-    ths[i].join();
-  }
-
-  const int idx = 1;
-  float result = GetGPUData(kRoot);
-
-  p::CPUPlace cpu_place;
-  p::CUDAPlace gpu_place(gpu_list_[idx]);
-
-  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
-  auto *rt = recv_tensor.data<float>();
-  auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
-  result_tensor->Resize(kDims);
-  auto *ct = result_tensor->mutable_data<float>(cpu_place);
-
-  paddle::memory::Copy(
-      cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
-      recv_tensor.numel() * sizeof(float),
-      static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());
-
-  for (int64_t j = 0; j < f::product(kDims); ++j) {
-    ASSERT_NEAR(ct[j], result, 1e-5);
-  }
-}
+// TODO(helin): enable the test for ncclBcastOp
+// TEST_F(NCCLTester, ncclBcastOp) {
+//   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
+//   const int kRoot = 0;
+//   op2->SetType("ncclBcast");
+//   op2->SetInput("X", {"st"});
+//   op2->SetInput("Communicator", {"comm"});
+//   op2->SetOutput("Out", {"rt"});
+//   op2->SetAttr("root", kRoot);
+
+//   std::vector<f::Scope *> dev_scopes;
+
+//   std::vector<std::thread> ths;
+
+//   for (size_t i = 0; i < gpu_list_.size(); ++i) {
+//     dev_scopes.emplace_back(&g_scope_.NewScope());
+//     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
+//                    *op2.get(), dev_scopes[i]);
+//     ths.emplace_back(std::move(th));
+//   }
+
+//   for (size_t i = 0; i < gpu_list_.size(); ++i) {
+//     ths[i].join();
+//   }
+
+//   const int idx = 1;
+//   float result = GetGPUData(kRoot);
+
+//   p::CPUPlace cpu_place;
+//   p::CUDAPlace gpu_place(gpu_list_[idx]);
+
+//   auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
+//   auto *rt = recv_tensor.data<float>();
+//   auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
+//   result_tensor->Resize(kDims);
+//   auto *ct = result_tensor->mutable_data<float>(cpu_place);
+
+//   paddle::memory::Copy(
+//       cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
+//       recv_tensor.numel() * sizeof(float),
+//       static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());
+
+//   for (int64_t j = 0; j < f::product(kDims); ++j) {
+//     ASSERT_NEAR(ct[j], result, 1e-5);
+//   }
+// }

From 9fbe90ef96e05fe10316ce2136e192423452ceab Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Fri, 30 Mar 2018 17:36:34 -0700
Subject: [PATCH 42/57] fix according to comments

---
 paddle/fluid/operators/nccl_op_test.cu.cc | 94 ++++++++++++-----------
 1 file changed, 48 insertions(+), 46 deletions(-)

diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc
index 7659bb9edd..28f13c8052 100644
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
@@ -236,49 +236,51 @@ TEST_F(NCCLTester, ncclReduceOp) {
 }
 
 // ncclBcastOp with desc
-// TODO(helin): enable the test for ncclBcastOp
-// TEST_F(NCCLTester, ncclBcastOp) {
-//   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
-//   const int kRoot = 0;
-//   op2->SetType("ncclBcast");
-//   op2->SetInput("X", {"st"});
-//   op2->SetInput("Communicator", {"comm"});
-//   op2->SetOutput("Out", {"rt"});
-//   op2->SetAttr("root", kRoot);
-
-//   std::vector<f::Scope *> dev_scopes;
-
-//   std::vector<std::thread> ths;
-
-//   for (size_t i = 0; i < gpu_list_.size(); ++i) {
-//     dev_scopes.emplace_back(&g_scope_.NewScope());
-//     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
-//                    *op2.get(), dev_scopes[i]);
-//     ths.emplace_back(std::move(th));
-//   }
-
-//   for (size_t i = 0; i < gpu_list_.size(); ++i) {
-//     ths[i].join();
-//   }
-
-//   const int idx = 1;
-//   float result = GetGPUData(kRoot);
-
-//   p::CPUPlace cpu_place;
-//   p::CUDAPlace gpu_place(gpu_list_[idx]);
-
-//   auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
-//   auto *rt = recv_tensor.data<float>();
-//   auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
-//   result_tensor->Resize(kDims);
-//   auto *ct = result_tensor->mutable_data<float>(cpu_place);
-
-//   paddle::memory::Copy(
-//       cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
-//       recv_tensor.numel() * sizeof(float),
-//       static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());
-
-//   for (int64_t j = 0; j < f::product(kDims); ++j) {
-//     ASSERT_NEAR(ct[j], result, 1e-5);
-//   }
-// }
+// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
+/*
+TEST_F(NCCLTester, ncclBcastOp) {
+  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
+  const int kRoot = 0;
+  op2->SetType("ncclBcast");
+  op2->SetInput("X", {"st"});
+  op2->SetInput("Communicator", {"comm"});
+  op2->SetOutput("Out", {"rt"});
+  op2->SetAttr("root", kRoot);
+
+  std::vector<f::Scope *> dev_scopes;
+
+  std::vector<std::thread> ths;
+
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope_.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
+                   *op2.get(), dev_scopes[i]);
+    ths.emplace_back(std::move(th));
+  }
+
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
+    ths[i].join();
+  }
+
+  const int idx = 1;
+  float result = GetGPUData(kRoot);
+
+  p::CPUPlace cpu_place;
+  p::CUDAPlace gpu_place(gpu_list_[idx]);
+
+  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
+  auto *rt = recv_tensor.data<float>();
+  auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
+  result_tensor->Resize(kDims);
+  auto *ct = result_tensor->mutable_data<float>(cpu_place);
+
+  paddle::memory::Copy(
+      cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
+      recv_tensor.numel() * sizeof(float),
+      static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());
+
+  for (int64_t j = 0; j < f::product(kDims); ++j) {
+    ASSERT_NEAR(ct[j], result, 1e-5);
+  }
+}
+*/

From c4720376c692e14f7c089f6c3604448a31cc9de6 Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Mon, 2 Apr 2018 09:59:32 -0700
Subject: [PATCH 43/57] disable ncclAllReduceOp as well

---
 paddle/fluid/operators/nccl_op_test.cu.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc
index 28f13c8052..a31d64e899 100644
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
@@ -137,6 +137,8 @@ class NCCLTester : public ::testing::Test {
 TEST_F(NCCLTester, ncclInitOp) {}
 
 // ncclAllReduceOp with desc
+// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
+/*
 TEST_F(NCCLTester, ncclAllReduceOp) {
   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   op2->SetType("ncclAllReduce");
@@ -184,6 +186,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
     }
   }
 }
+*/
 
 // ncclReduceOp with desc
 TEST_F(NCCLTester, ncclReduceOp) {

From f837eee724824a7f06025152400e70a1b9a2be53 Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Mon, 2 Apr 2018 13:21:03 -0700
Subject: [PATCH 44/57] add paddle.v2.reader,dataset back for backward
 compatibility

---
 python/CMakeLists.txt                         |   1 +
 python/paddle/dataset/__init__.py             |   2 +-
 python/paddle/v2/__init__.py                  |   8 +
 python/paddle/v2/dataset/__init__.py          |  46 ++
 python/paddle/v2/dataset/cifar.py             | 139 ++++++
 python/paddle/v2/dataset/common.py            | 236 ++++++++++
 python/paddle/v2/dataset/conll05.py           | 257 +++++++++++
 python/paddle/v2/dataset/flowers.py           | 199 +++++++++
 python/paddle/v2/dataset/imdb.py              | 148 +++++++
 python/paddle/v2/dataset/imikolov.py          | 161 +++++++
 python/paddle/v2/dataset/mnist.py             | 123 ++++++
 python/paddle/v2/dataset/movielens.py         | 262 +++++++++++
 python/paddle/v2/dataset/mq2007.py            | 333 ++++++++++++++
 python/paddle/v2/dataset/sentiment.py         | 141 ++++++
 python/paddle/v2/dataset/tests/cifar_test.py  |  56 +++
 python/paddle/v2/dataset/tests/common_test.py |  94 ++++
 .../paddle/v2/dataset/tests/flowers_test.py   |  51 +++
 python/paddle/v2/dataset/tests/imdb_test.py   |  57 +++
 .../paddle/v2/dataset/tests/imikolov_test.py  |  67 +++
 python/paddle/v2/dataset/tests/mnist_test.py  |  44 ++
 python/paddle/v2/dataset/tests/mq2007_test.py |  33 ++
 .../paddle/v2/dataset/tests/test_sentiment.py |  55 +++
 .../paddle/v2/dataset/tests/voc2012_test.py   |  42 ++
 python/paddle/v2/dataset/tests/wmt16_test.py  |  66 +++
 python/paddle/v2/dataset/uci_housing.py       | 134 ++++++
 python/paddle/v2/dataset/voc2012.py           |  85 ++++
 python/paddle/v2/dataset/wmt14.py             | 182 ++++++++
 python/paddle/v2/dataset/wmt16.py             | 349 +++++++++++++++
 python/paddle/v2/image.py                     | 381 ++++++++++++++++
 python/paddle/v2/minibatch.py                 |  41 ++
 python/paddle/v2/reader/__init__.py           |  74 ++++
 python/paddle/v2/reader/creator.py            | 130 ++++++
 python/paddle/v2/reader/decorator.py          | 405 ++++++++++++++++++
 python/paddle/v2/reader/tests/CMakeLists.txt  |   2 +
 python/paddle/v2/reader/tests/__init__.py     |  13 +
 python/paddle/v2/reader/tests/creator_test.py |  74 ++++
 .../paddle/v2/reader/tests/decorator_test.py  | 178 ++++++++
 .../v2/reader/tests/test_data_creator.txt     |   3 +
 .../v2/reader/tests/test_reader_recordio.dat  | Bin 0 -> 76 bytes
 .../v2/reader/tests/test_recordio_creator.dat | Bin 0 -> 88 bytes
 python/paddle/v2/tests/CMakeLists.txt         |   1 +
 python/paddle/v2/tests/cat.jpg                | Bin 0 -> 57218 bytes
 python/paddle/v2/tests/test_image.py          |  43 ++
 .../paddle/v2/tests/test_paramconf_order.py   |   3 +-
 python/setup.py.in                            |   2 +
 45 files changed, 4718 insertions(+), 3 deletions(-)
 create mode 100644 python/paddle/v2/dataset/__init__.py
 create mode 100644 python/paddle/v2/dataset/cifar.py
 create mode 100644 python/paddle/v2/dataset/common.py
 create mode 100644 python/paddle/v2/dataset/conll05.py
 create mode 100644 python/paddle/v2/dataset/flowers.py
 create mode 100644 python/paddle/v2/dataset/imdb.py
 create mode 100644 python/paddle/v2/dataset/imikolov.py
 create mode 100644 python/paddle/v2/dataset/mnist.py
 create mode 100644 python/paddle/v2/dataset/movielens.py
 create mode 100644 python/paddle/v2/dataset/mq2007.py
 create mode 100644 python/paddle/v2/dataset/sentiment.py
 create mode 100644 python/paddle/v2/dataset/tests/cifar_test.py
 create mode 100644 python/paddle/v2/dataset/tests/common_test.py
 create mode 100644 python/paddle/v2/dataset/tests/flowers_test.py
 create mode 100644 python/paddle/v2/dataset/tests/imdb_test.py
 create mode 100644 python/paddle/v2/dataset/tests/imikolov_test.py
 create mode 100644 python/paddle/v2/dataset/tests/mnist_test.py
 create mode 100644 python/paddle/v2/dataset/tests/mq2007_test.py
 create mode 100644 python/paddle/v2/dataset/tests/test_sentiment.py
 create mode 100644 python/paddle/v2/dataset/tests/voc2012_test.py
 create mode 100644 python/paddle/v2/dataset/tests/wmt16_test.py
 create mode 100644 python/paddle/v2/dataset/uci_housing.py
 create mode 100644 python/paddle/v2/dataset/voc2012.py
 create mode 100644 python/paddle/v2/dataset/wmt14.py
 create mode 100644 python/paddle/v2/dataset/wmt16.py
 create mode 100644 python/paddle/v2/image.py
 create mode 100644 python/paddle/v2/minibatch.py
 create mode 100644 python/paddle/v2/reader/__init__.py
 create mode 100644 python/paddle/v2/reader/creator.py
 create mode 100644 python/paddle/v2/reader/decorator.py
 create mode 100644 python/paddle/v2/reader/tests/CMakeLists.txt
 create mode 100644 python/paddle/v2/reader/tests/__init__.py
 create mode 100644 python/paddle/v2/reader/tests/creator_test.py
 create mode 100644 python/paddle/v2/reader/tests/decorator_test.py
 create mode 100644 python/paddle/v2/reader/tests/test_data_creator.txt
 create mode 100644 python/paddle/v2/reader/tests/test_reader_recordio.dat
 create mode 100644 python/paddle/v2/reader/tests/test_recordio_creator.dat
 create mode 100644 python/paddle/v2/tests/cat.jpg
 create mode 100644 python/paddle/v2/tests/test_image.py

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index f5ae553c85..d074b0136d 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -81,6 +81,7 @@ if (WITH_TESTING)
       # enable v2 API unittest only when paddle swig api is compiled
       add_subdirectory(paddle/v2/tests)
       add_subdirectory(paddle/v2/plot/tests)
+      add_subdirectory(paddle/v2/reader/tests)
     endif()
   endif()
   add_subdirectory(paddle/fluid/tests)
diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py
index 1fdfd49f1c..3315e826e8 100644
--- a/python/paddle/dataset/__init__.py
+++ b/python/paddle/dataset/__init__.py
@@ -37,7 +37,7 @@ __all__ = [
     'cifar',
     'movielens',
     'conll05',
-    'sentiment'
+    'sentiment',
     'uci_housing',
     'wmt14',
     'wmt16',
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 02b0d077ee..df710c33d0 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -22,13 +22,17 @@ import data_type
 import topology
 import networks
 import evaluator
+from . import dataset
+from . import reader
 from . import plot
 import attr
 import op
 import pooling
 import inference
 import networks
+import minibatch
 import plot
+import image
 import paddle.trainer.config_parser as cp
 
 __all__ = [
@@ -44,11 +48,14 @@ __all__ = [
     'data_type',
     'attr',
     'pooling',
+    'dataset',
+    'reader',
     'topology',
     'networks',
     'infer',
     'plot',
     'evaluator',
+    'image',
     'master',
 ]
 
@@ -146,3 +153,4 @@ def init(**kwargs):
 
 
 infer = inference.infer
+batch = minibatch.batch
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
new file mode 100644
index 0000000000..c1acbecd9c
--- /dev/null
+++ b/python/paddle/v2/dataset/__init__.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Dataset package.
+"""
+
+import mnist
+import imikolov
+import imdb
+import cifar
+import movielens
+import conll05
+import uci_housing
+import sentiment
+import wmt14
+import wmt16
+import mq2007
+import flowers
+import voc2012
+
+__all__ = [
+    'mnist',
+    'imikolov',
+    'imdb',
+    'cifar',
+    'movielens',
+    'conll05',
+    'sentiment'
+    'uci_housing',
+    'wmt14',
+    'wmt16',
+    'mq2007',
+    'flowers',
+    'voc2012',
+]
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
new file mode 100644
index 0000000000..0a2a1ced11
--- /dev/null
+++ b/python/paddle/v2/dataset/cifar.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CIFAR dataset.
+
+This module will download dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
+paddle reader creators.
+
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
+
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
+
+"""
+
+import cPickle
+import itertools
+import numpy
+import paddle.v2.dataset.common
+import tarfile
+
+__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
+
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+
+
+def reader_creator(filename, sub_name):
+    def read_batch(batch):
+        data = batch['data']
+        labels = batch.get('labels', batch.get('fine_labels', None))
+        assert labels is not None
+        for sample, label in itertools.izip(data, labels):
+            yield (sample / 255.0).astype(numpy.float32), int(label)
+
+    def reader():
+        with tarfile.open(filename, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if sub_name in each_item.name)
+
+            for name in names:
+                batch = cPickle.load(f.extractfile(name))
+                for item in read_batch(batch):
+                    yield item
+
+    return reader
+
+
+def train100():
+    """
+    CIFAR-100 training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 99].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'train')
+
+
+def test100():
+    """
+    CIFAR-100 test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'test')
+
+
+def train10():
+    """
+    CIFAR-10 training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'data_batch')
+
+
+def test10():
+    """
+    CIFAR-10 test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'test_batch')
+
+
+def fetch():
+    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
+    paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
+    paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
+    paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
new file mode 100644
index 0000000000..c6ff09a1d1
--- /dev/null
+++ b/python/paddle/v2/dataset/common.py
@@ -0,0 +1,236 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import hashlib
+import os
+import errno
+import shutil
+import sys
+import importlib
+import paddle.v2.dataset
+import cPickle
+import glob
+import cPickle as pickle
+
+__all__ = [
+    'DATA_HOME',
+    'download',
+    'md5file',
+    'split',
+    'cluster_files_reader',
+    'convert',
+]
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
+
+
+# When running unit tests, there could be multiple processes that
+# trying to create DATA_HOME directory simultaneously, so we cannot
+# use a if condition to check for the existence of the directory;
+# instead, we use the filesystem as the synchronization mechanism by
+# catching returned errors.
+def must_mkdirs(path):
+    try:
+        os.makedirs(DATA_HOME)
+    except OSError as exc:
+        if exc.errno != errno.EEXIST:
+            raise
+        pass
+
+
+must_mkdirs(DATA_HOME)
+
+
+def md5file(fname):
+    hash_md5 = hashlib.md5()
+    f = open(fname, "rb")
+    for chunk in iter(lambda: f.read(4096), b""):
+        hash_md5.update(chunk)
+    f.close()
+    return hash_md5.hexdigest()
+
+
+def download(url, module_name, md5sum, save_name=None):
+    dirname = os.path.join(DATA_HOME, module_name)
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+
+    filename = os.path.join(dirname,
+                            url.split('/')[-1]
+                            if save_name is None else save_name)
+
+    retry = 0
+    retry_limit = 3
+    while not (os.path.exists(filename) and md5file(filename) == md5sum):
+        if os.path.exists(filename):
+            print "file md5", md5file(filename), md5sum
+        if retry < retry_limit:
+            retry += 1
+        else:
+            raise RuntimeError("Cannot download {0} within retry limit {1}".
+                               format(url, retry_limit))
+        print "Cache file %s not found, downloading %s" % (filename, url)
+        r = requests.get(url, stream=True)
+        total_length = r.headers.get('content-length')
+
+        if total_length is None:
+            with open(filename, 'w') as f:
+                shutil.copyfileobj(r.raw, f)
+        else:
+            with open(filename, 'w') as f:
+                dl = 0
+                total_length = int(total_length)
+                for data in r.iter_content(chunk_size=4096):
+                    dl += len(data)
+                    f.write(data)
+                    done = int(50 * dl / total_length)
+                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                                                   ' ' * (50 - done)))
+                    sys.stdout.flush()
+
+    return filename
+
+
+def fetch_all():
+    for module_name in filter(lambda x: not x.startswith("__"),
+                              dir(paddle.v2.dataset)):
+        if "fetch" in dir(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name)):
+            getattr(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name),
+                "fetch")()
+
+
+def fetch_all_recordio(path):
+    for module_name in filter(lambda x: not x.startswith("__"),
+                              dir(paddle.v2.dataset)):
+        if "convert" in dir(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
+                not module_name == "common":
+            ds_path = os.path.join(path, module_name)
+            must_mkdirs(ds_path)
+            getattr(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name),
+                "convert")(ds_path)
+
+
+def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
+    """
+    you can call the function as:
+
+    split(paddle.v2.dataset.cifar.train10(), line_count=1000,
+        suffix="imikolov-train-%05d.pickle")
+
+    the output files as:
+
+    |-imikolov-train-00000.pickle
+    |-imikolov-train-00001.pickle
+    |- ...
+    |-imikolov-train-00480.pickle
+
+    :param reader: is a reader creator
+    :param line_count: line count for each file
+    :param suffix: the suffix for the output files, should contain "%d"
+                means the id for each file. Default is "%05d.pickle"
+    :param dumper: is a callable function that dump object to file, this
+                function will be called as dumper(obj, f) and obj is the object
+                will be dumped, f is a file object. Default is cPickle.dump.
+    """
+    if not callable(dumper):
+        raise TypeError("dumper should be callable.")
+    lines = []
+    indx_f = 0
+    for i, d in enumerate(reader()):
+        lines.append(d)
+        if i >= line_count and i % line_count == 0:
+            with open(suffix % indx_f, "w") as f:
+                dumper(lines, f)
+                lines = []
+                indx_f += 1
+    if lines:
+        with open(suffix % indx_f, "w") as f:
+            dumper(lines, f)
+
+
+def cluster_files_reader(files_pattern,
+                         trainer_count,
+                         trainer_id,
+                         loader=cPickle.load):
+    """
+    Create a reader that yield element from the given files, select
+    a file set according trainer count and trainer_id
+
+    :param files_pattern: the files which generating by split(...)
+    :param trainer_count: total trainer count
+    :param trainer_id: the trainer rank id
+    :param loader: is a callable function that load object from file, this
+                function will be called as loader(f) and f is a file object.
+                Default is cPickle.load
+    """
+
+    def reader():
+        if not callable(loader):
+            raise TypeError("loader should be callable.")
+        file_list = glob.glob(files_pattern)
+        file_list.sort()
+        my_file_list = []
+        for idx, fn in enumerate(file_list):
+            if idx % trainer_count == trainer_id:
+                print "append file: %s" % fn
+                my_file_list.append(fn)
+        for fn in my_file_list:
+            with open(fn, "r") as f:
+                lines = loader(f)
+                for line in lines:
+                    yield line
+
+    return reader
+
+
+def convert(output_path, reader, line_count, name_prefix):
+    import recordio
+    """
+    Convert data from reader to recordio format files.
+
+    :param output_path: directory in which output files will be saved.
+    :param reader: a data reader, from which the convert program will read
+                   data instances.
+    :param name_prefix: the name prefix of generated files.
+    :param max_lines_to_shuffle: the max lines numbers to shuffle before
+                                 writing.
+    """
+
+    assert line_count >= 1
+    indx_f = 0
+
+    def write_data(indx_f, lines):
+        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
+        writer = recordio.writer(filename)
+        for l in lines:
+            # FIXME(Yancey1989):
+            # dumps with protocol: pickle.HIGHEST_PROTOCOL
+            writer.write(cPickle.dumps(l))
+        writer.close()
+
+    lines = []
+    for i, d in enumerate(reader()):
+        lines.append(d)
+        if i % line_count == 0 and i >= line_count:
+            write_data(indx_f, lines)
+            lines = []
+            indx_f += 1
+            continue
+
+    write_data(indx_f, lines)
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
new file mode 100644
index 0000000000..0d544efac9
--- /dev/null
+++ b/python/paddle/v2/dataset/conll05.py
@@ -0,0 +1,257 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Conll05 dataset.
+Paddle semantic role labeling Book and demo use this dataset as an example.
+Because Conll05 is not free in public, the default downloaded URL is test set
+of Conll05 (which is public). Users can change URL and MD5 to their Conll
+dataset. And a pre-trained word vector model based on Wikipedia corpus is used
+to initialize SRL model.
+"""
+
+import tarfile
+import gzip
+import itertools
+import paddle.v2.dataset.common
+
+__all__ = ['test, get_dict', 'get_embedding', 'convert']
+
+DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
+DATA_MD5 = '387719152ae52d60422c016e92a742fc'
+WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
+WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
+VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
+VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
+TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
+TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
+EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
+EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
+
+UNK_IDX = 0
+
+
+def load_label_dict(filename):
+    d = dict()
+    tag_dict = set()
+    with open(filename, 'r') as f:
+        for i, line in enumerate(f):
+            line = line.strip()
+            if line.startswith("B-"):
+                tag_dict.add(line[2:])
+            elif line.startswith("I-"):
+                tag_dict.add(line[2:])
+        index = 0
+        for tag in tag_dict:
+            d["B-" + tag] = index
+            index += 1
+            d["I-" + tag] = index
+            index += 1
+        d["O"] = index
+    return d
+
+
+def load_dict(filename):
+    d = dict()
+    with open(filename, 'r') as f:
+        for i, line in enumerate(f):
+            d[line.strip()] = i
+    return d
+
+
+def corpus_reader(data_path, words_name, props_name):
+    """
+    Read one corpus. It returns an iterator. Each element of
+    this iterator is a tuple including sentence and labels. The sentence is
+    consist of a list of word IDs. The labels include a list of label IDs.
+    :return: a iterator of data.
+    :rtype: iterator
+    """
+
+    def reader():
+        tf = tarfile.open(data_path)
+        wf = tf.extractfile(words_name)
+        pf = tf.extractfile(props_name)
+        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
+                fileobj=pf) as props_file:
+            sentences = []
+            labels = []
+            one_seg = []
+            for word, label in itertools.izip(words_file, props_file):
+                word = word.strip()
+                label = label.strip().split()
+
+                if len(label) == 0:  # end of sentence
+                    for i in xrange(len(one_seg[0])):
+                        a_kind_lable = [x[i] for x in one_seg]
+                        labels.append(a_kind_lable)
+
+                    if len(labels) >= 1:
+                        verb_list = []
+                        for x in labels[0]:
+                            if x != '-':
+                                verb_list.append(x)
+
+                        for i, lbl in enumerate(labels[1:]):
+                            cur_tag = 'O'
+                            is_in_bracket = False
+                            lbl_seq = []
+                            verb_word = ''
+                            for l in lbl:
+                                if l == '*' and is_in_bracket == False:
+                                    lbl_seq.append('O')
+                                elif l == '*' and is_in_bracket == True:
+                                    lbl_seq.append('I-' + cur_tag)
+                                elif l == '*)':
+                                    lbl_seq.append('I-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') != -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') == -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = True
+                                else:
+                                    raise RuntimeError('Unexpected label: %s' %
+                                                       l)
+
+                            yield sentences, verb_list[i], lbl_seq
+
+                    sentences = []
+                    labels = []
+                    one_seg = []
+                else:
+                    sentences.append(word)
+                    one_seg.append(label)
+
+        pf.close()
+        wf.close()
+        tf.close()
+
+    return reader
+
+
+def reader_creator(corpus_reader,
+                   word_dict=None,
+                   predicate_dict=None,
+                   label_dict=None):
+    def reader():
+        for sentence, predicate, labels in corpus_reader():
+
+            sen_len = len(sentence)
+
+            verb_index = labels.index('B-V')
+            mark = [0] * len(labels)
+            if verb_index > 0:
+                mark[verb_index - 1] = 1
+                ctx_n1 = sentence[verb_index - 1]
+            else:
+                ctx_n1 = 'bos'
+
+            if verb_index > 1:
+                mark[verb_index - 2] = 1
+                ctx_n2 = sentence[verb_index - 2]
+            else:
+                ctx_n2 = 'bos'
+
+            mark[verb_index] = 1
+            ctx_0 = sentence[verb_index]
+
+            if verb_index < len(labels) - 1:
+                mark[verb_index + 1] = 1
+                ctx_p1 = sentence[verb_index + 1]
+            else:
+                ctx_p1 = 'eos'
+
+            if verb_index < len(labels) - 2:
+                mark[verb_index + 2] = 1
+                ctx_p2 = sentence[verb_index + 2]
+            else:
+                ctx_p2 = 'eos'
+
+            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
+
+            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
+            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+            pred_idx = [predicate_dict.get(predicate)] * sen_len
+            label_idx = [label_dict.get(w) for w in labels]
+
+            yield word_idx, ctx_n2_idx, ctx_n1_idx, \
+              ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx
+
+    return reader
+
+
+def get_dict():
+    """
+    Get the word, verb and label dictionary of Wikipedia corpus.
+    """
+    word_dict = load_dict(
+        paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
+                                          WORDDICT_MD5))
+    verb_dict = load_dict(
+        paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
+                                          VERBDICT_MD5))
+    label_dict = load_label_dict(
+        paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
+                                          TRGDICT_MD5))
+    return word_dict, verb_dict, label_dict
+
+
+def get_embedding():
+    """
+    Get the trained word vector based on Wikipedia corpus.
+    """
+    return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+
+
+def test():
+    """
+    Conll05 test set creator.
+
+    Because the training dataset is not free, the test dataset is used for
+    training. It returns a reader creator, each sample in the reader is nine
+    features, including sentence sequence, predicate, predicate context,
+    predicate context flag and tagged sequence.
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    word_dict, verb_dict, label_dict = get_dict()
+    reader = corpus_reader(
+        paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
+        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
+        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
+    return reader_creator(reader, word_dict, verb_dict, label_dict)
+
+
+def fetch():
+    paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+    paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+    paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+    paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+    paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train")
+    paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test")
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
new file mode 100644
index 0000000000..7bdddeaabe
--- /dev/null
+++ b/python/paddle/v2/dataset/flowers.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module will download dataset from
+http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
+and parse train/test set intopaddle reader creators.
+
+This set contains images of flowers belonging to 102 different categories.
+The images were acquired by searching the web and taking pictures. There are a
+minimum of 40 images for each category.
+
+The database was used in:
+
+Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
+ number of classes.Proceedings of the Indian Conference on Computer Vision,
+Graphics and Image Processing (2008)
+http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
+
+"""
+import cPickle
+import itertools
+import functools
+from common import download
+import tarfile
+import scipy.io as scio
+from paddle.v2.image import *
+from paddle.v2.reader import *
+import os
+import numpy as np
+from multiprocessing import cpu_count
+__all__ = ['train', 'test', 'valid']
+
+DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
+LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
+SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
+DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
+LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
+SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
+# In official 'readme', tstid is the flag of test data
+# and trnid is the flag of train data. But test data is more than train data.
+# So we exchange the train data and test data.
+TRAIN_FLAG = 'tstid'
+TEST_FLAG = 'trnid'
+VALID_FLAG = 'valid'
+
+
+def default_mapper(is_train, sample):
+    '''
+    map image bytes data to type needed by model input layer
+    '''
+    img, label = sample
+    img = load_image_bytes(img)
+    img = simple_transform(
+        img, 256, 224, is_train, mean=[103.94, 116.78, 123.68])
+    return img.flatten().astype('float32'), label
+
+
+train_mapper = functools.partial(default_mapper, True)
+test_mapper = functools.partial(default_mapper, False)
+
+
+def reader_creator(data_file,
+                   label_file,
+                   setid_file,
+                   dataset_name,
+                   mapper,
+                   buffered_size=1024,
+                   use_xmap=True):
+    '''
+    1. read images from tar file and
+        merge images into batch files in 102flowers.tgz_batch/
+    2. get a reader to read sample from batch file
+
+    :param data_file: downloaded data file
+    :type data_file: string
+    :param label_file: downloaded label file
+    :type label_file: string
+    :param setid_file: downloaded setid file containing information
+                        about how to split dataset
+    :type setid_file: string
+    :param dataset_name: data set name (tstid|trnid|valid)
+    :type dataset_name: string
+    :param mapper: a function to map image bytes data to type
+                    needed by model input layer
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: data reader
+    :rtype: callable
+    '''
+    labels = scio.loadmat(label_file)['labels'][0]
+    indexes = scio.loadmat(setid_file)[dataset_name][0]
+    img2label = {}
+    for i in indexes:
+        img = "jpg/image_%05d.jpg" % i
+        img2label[img] = labels[i - 1]
+    file_list = batch_images_from_tar(data_file, dataset_name, img2label)
+
+    def reader():
+        for file in open(file_list):
+            file = file.strip()
+            batch = None
+            with open(file, 'r') as f:
+                batch = cPickle.load(f)
+            data = batch['data']
+            labels = batch['label']
+            for sample, label in itertools.izip(data, batch['label']):
+                yield sample, int(label) - 1
+
+    if use_xmap:
+        return xmap_readers(mapper, reader, cpu_count(), buffered_size)
+    else:
+        return map_readers(mapper, reader)
+
+
+def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
+    '''
+    Create flowers training set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [1, 102]
+    translated from original color image by steps:
+    1. resize to 256*256
+    2. random crop to 224*224
+    3. flatten
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: train data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
+        buffered_size, use_xmap)
+
+
+def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
+    '''
+    Create flowers test set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [1, 102]
+    translated from original color image by steps:
+    1. resize to 256*256
+    2. random crop to 224*224
+    3. flatten
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: test data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
+        buffered_size, use_xmap)
+
+
+def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
+    '''
+    Create flowers validation set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [1, 102]
+    translated from original color image by steps:
+    1. resize to 256*256
+    2. random crop to 224*224
+    3. flatten
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: test data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
+        buffered_size, use_xmap)
+
+
+def fetch():
+    download(DATA_URL, 'flowers', DATA_MD5)
+    download(LABEL_URL, 'flowers', LABEL_MD5)
+    download(SETID_URL, 'flowers', SETID_MD5)
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
new file mode 100644
index 0000000000..37c4296f9b
--- /dev/null
+++ b/python/paddle/v2/dataset/imdb.py
@@ -0,0 +1,148 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+IMDB dataset.
+
+This module downloads IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
+of 25,000 highly polar movie reviews for training, and 25,000 for testing.
+Besides, this module also provides API for building dictionary.
+"""
+
+import paddle.v2.dataset.common
+import collections
+import tarfile
+import re
+import string
+
+__all__ = ['build_dict', 'train', 'test', 'convert']
+
+URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
+MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+
+
+def tokenize(pattern):
+    """
+    Read files that match the given pattern.  Tokenize and yield each file.
+    """
+
+    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
+                                                        MD5)) as tarf:
+        # Note that we should use tarfile.next(), which does
+        # sequential access of member files, other than
+        # tarfile.extractfile, which does random access and might
+        # destroy hard disks.
+        tf = tarf.next()
+        while tf != None:
+            if bool(pattern.match(tf.name)):
+                # newline and punctuations removal and ad-hoc tokenization.
+                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
+                    None, string.punctuation).lower().split()
+            tf = tarf.next()
+
+
+def build_dict(pattern, cutoff):
+    """
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
+    """
+    word_freq = collections.defaultdict(int)
+    for doc in tokenize(pattern):
+        for word in doc:
+            word_freq[word] += 1
+
+    # Not sure if we should prune less-frequent words here.
+    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
+
+    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+    words, _ = list(zip(*dictionary))
+    word_idx = dict(zip(words, xrange(len(words))))
+    word_idx['<unk>'] = len(words)
+    return word_idx
+
+
+def reader_creator(pos_pattern, neg_pattern, word_idx):
+    UNK = word_idx['<unk>']
+    INS = []
+
+    def load(pattern, out, label):
+        for doc in tokenize(pattern):
+            out.append(([word_idx.get(w, UNK) for w in doc], label))
+
+    load(pos_pattern, INS, 0)
+    load(neg_pattern, INS, 1)
+
+    def reader():
+        for doc, label in INS:
+            yield doc, label
+
+    return reader
+
+
+def train(word_idx):
+    """
+    IMDB training set creator.
+
+    It returns a reader creator, each sample in the reader is an zero-based ID
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        re.compile("aclImdb/train/pos/.*\.txt$"),
+        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)
+
+
+def test(word_idx):
+    """
+    IMDB test set creator.
+
+    It returns a reader creator, each sample in the reader is an zero-based ID
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Test reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        re.compile("aclImdb/test/pos/.*\.txt$"),
+        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
+
+
+def word_dict():
+    """
+    Build a word dictionary from the corpus.
+
+    :return: Word dictionary
+    :rtype: dict
+    """
+    return build_dict(
+        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
+
+
+def fetch():
+    paddle.v2.dataset.common.download(URL, 'imdb', MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    w = word_dict()
+    paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
+    paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
new file mode 100644
index 0000000000..617c722c41
--- /dev/null
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+imikolov's simple dataset.
+
+This module will download dataset from 
+http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
+into paddle reader creators.
+"""
+import paddle.v2.dataset.common
+import collections
+import tarfile
+
+__all__ = ['train', 'test', 'build_dict', 'convert']
+
+URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
+MD5 = '30177ea32e27c525793142b6bf2c8e2d'
+
+
+class DataType(object):
+    NGRAM = 1
+    SEQ = 2
+
+
+def word_count(f, word_freq=None):
+    if word_freq is None:
+        word_freq = collections.defaultdict(int)
+
+    for l in f:
+        for w in l.strip().split():
+            word_freq[w] += 1
+        word_freq['<s>'] += 1
+        word_freq['<e>'] += 1
+
+    return word_freq
+
+
+def build_dict(min_word_freq=50):
+    """
+    Build a word dictionary from the corpus,  Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
+    """
+    train_filename = './simple-examples/data/ptb.train.txt'
+    test_filename = './simple-examples/data/ptb.valid.txt'
+    with tarfile.open(
+            paddle.v2.dataset.common.download(
+                paddle.v2.dataset.imikolov.URL, 'imikolov',
+                paddle.v2.dataset.imikolov.MD5)) as tf:
+        trainf = tf.extractfile(train_filename)
+        testf = tf.extractfile(test_filename)
+        word_freq = word_count(testf, word_count(trainf))
+        if '<unk>' in word_freq:
+            # remove <unk> for now, since we will set it as last index
+            del word_freq['<unk>']
+
+        word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
+
+        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+        words, _ = list(zip(*word_freq_sorted))
+        word_idx = dict(zip(words, xrange(len(words))))
+        word_idx['<unk>'] = len(words)
+
+    return word_idx
+
+
+def reader_creator(filename, word_idx, n, data_type):
+    def reader():
+        with tarfile.open(
+                paddle.v2.dataset.common.download(
+                    paddle.v2.dataset.imikolov.URL, 'imikolov',
+                    paddle.v2.dataset.imikolov.MD5)) as tf:
+            f = tf.extractfile(filename)
+
+            UNK = word_idx['<unk>']
+            for l in f:
+                if DataType.NGRAM == data_type:
+                    assert n > -1, 'Invalid gram length'
+                    l = ['<s>'] + l.strip().split() + ['<e>']
+                    if len(l) >= n:
+                        l = [word_idx.get(w, UNK) for w in l]
+                        for i in range(n, len(l) + 1):
+                            yield tuple(l[i - n:i])
+                elif DataType.SEQ == data_type:
+                    l = l.strip().split()
+                    l = [word_idx.get(w, UNK) for w in l]
+                    src_seq = [word_idx['<s>']] + l
+                    trg_seq = l + [word_idx['<e>']]
+                    if n > 0 and len(src_seq) > n: continue
+                    yield src_seq, trg_seq
+                else:
+                    assert False, 'Unknow data type'
+
+    return reader
+
+
+def train(word_idx, n, data_type=DataType.NGRAM):
+    """
+    imikolov training set creator.
+
+    It returns a reader creator, each sample in the reader is a word ID
+    tuple.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size if type is ngram, otherwise max length of sequence
+    :type n: int
+    :param data_type: data type (ngram or sequence)
+    :type data_type: member variable of DataType (NGRAM or SEQ)
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n,
+                          data_type)
+
+
+def test(word_idx, n, data_type=DataType.NGRAM):
+    """
+    imikolov test set creator.
+
+    It returns a reader creator, each sample in the reader is a word ID
+    tuple.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size if type is ngram, otherwise max length of sequence
+    :type n: int
+    :param data_type: data type (ngram or sequence)
+    :type data_type: member variable of DataType (NGRAM or SEQ)
+    :return: Test reader creator
+    :rtype: callable
+    """
+    return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n,
+                          data_type)
+
+
+def fetch():
+    paddle.v2.dataset.common.download(URL, "imikolov", MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    N = 5
+    word_dict = build_dict()
+    paddle.v2.dataset.common.convert(path,
+                                     train(word_dict, N), 1000,
+                                     "imikolov_train")
+    paddle.v2.dataset.common.convert(path,
+                                     test(word_dict, N), 1000, "imikolov_test")
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
new file mode 100644
index 0000000000..9f675bed89
--- /dev/null
+++ b/python/paddle/v2/dataset/mnist.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MNIST dataset.
+
+This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
+parse training set and test set into paddle reader creators.
+"""
+import paddle.v2.dataset.common
+import subprocess
+import numpy
+import platform
+__all__ = ['train', 'test', 'convert']
+
+URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
+TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
+TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
+TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
+TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
+TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
+TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
+TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
+TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
+
+
+def reader_creator(image_filename, label_filename, buffer_size):
+    def reader():
+        if platform.system() == 'Darwin':
+            zcat_cmd = 'gzcat'
+        elif platform.system() == 'Linux':
+            zcat_cmd = 'zcat'
+        else:
+            raise NotImplementedError()
+
+        # According to http://stackoverflow.com/a/38061619/724872, we
+        # cannot use standard package gzip here.
+        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
+        m.stdout.read(16)  # skip some magic bytes
+
+        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
+        l.stdout.read(8)  # skip some magic bytes
+
+        try:  # reader could be break.
+            while True:
+                labels = numpy.fromfile(
+                    l.stdout, 'ubyte', count=buffer_size).astype("int")
+
+                if labels.size != buffer_size:
+                    break  # numpy.fromfile returns empty slice after EOF.
+
+                images = numpy.fromfile(
+                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
+                        (buffer_size, 28 * 28)).astype('float32')
+
+                images = images / 255.0 * 2.0 - 1.0
+
+                for i in xrange(buffer_size):
+                    yield images[i, :], int(labels[i])
+        finally:
+            m.terminate()
+            l.terminate()
+
+    return reader
+
+
+def train():
+    """
+    MNIST training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
+                                          TRAIN_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
+                                          TRAIN_LABEL_MD5), 100)
+
+
+def test():
+    """
+    MNIST test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist',
+                                          TEST_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
+                                          TEST_LABEL_MD5), 100)
+
+
+def fetch():
+    paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train")
+    paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test")
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
new file mode 100644
index 0000000000..5b61a9420a
--- /dev/null
+++ b/python/paddle/v2/dataset/movielens.py
@@ -0,0 +1,262 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Movielens 1-M dataset.
+
+Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
+movies, which was collected by GroupLens Research. This module will download
+Movielens 1-M dataset from 
+http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
+set and test set into paddle reader creators.
+
+"""
+
+import zipfile
+import paddle.v2.dataset.common
+import re
+import random
+import functools
+
+__all__ = [
+    'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
+    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info',
+    'convert'
+]
+
+age_table = [1, 18, 25, 35, 45, 50, 56]
+
+URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
+MD5 = 'c4d9eecfca2ab87c1945afe126590906'
+
+
+class MovieInfo(object):
+    """
+    Movie id, title and categories information are stored in MovieInfo.
+    """
+
+    def __init__(self, index, categories, title):
+        self.index = int(index)
+        self.categories = categories
+        self.title = title
+
+    def value(self):
+        """
+        Get information from a movie.
+        """
+        return [
+            self.index, [CATEGORIES_DICT[c] for c in self.categories],
+            [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
+        ]
+
+    def __str__(self):
+        return "<MovieInfo id(%d), title(%s), categories(%s)>" % (
+            self.index, self.title, self.categories)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class UserInfo(object):
+    """
+    User id, gender, age, and job information are stored in UserInfo.
+    """
+
+    def __init__(self, index, gender, age, job_id):
+        self.index = int(index)
+        self.is_male = gender == 'M'
+        self.age = age_table.index(int(age))
+        self.job_id = int(job_id)
+
+    def value(self):
+        """
+        Get information from a user.
+        """
+        return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
+
+    def __str__(self):
+        return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
+            self.index, "M"
+            if self.is_male else "F", age_table[self.age], self.job_id)
+
+    def __repr__(self):
+        return str(self)
+
+
+MOVIE_INFO = None
+MOVIE_TITLE_DICT = None
+CATEGORIES_DICT = None
+USER_INFO = None
+
+
+def __initialize_meta_info__():
+    fn = paddle.v2.dataset.common.download(URL, "movielens", MD5)
+    global MOVIE_INFO
+    if MOVIE_INFO is None:
+        pattern = re.compile(r'^(.*)\((\d+)\)$')
+        with zipfile.ZipFile(file=fn) as package:
+            for info in package.infolist():
+                assert isinstance(info, zipfile.ZipInfo)
+                MOVIE_INFO = dict()
+                title_word_set = set()
+                categories_set = set()
+                with package.open('ml-1m/movies.dat') as movie_file:
+                    for i, line in enumerate(movie_file):
+                        movie_id, title, categories = line.strip().split('::')
+                        categories = categories.split('|')
+                        for c in categories:
+                            categories_set.add(c)
+                        title = pattern.match(title).group(1)
+                        MOVIE_INFO[int(movie_id)] = MovieInfo(
+                            index=movie_id, categories=categories, title=title)
+                        for w in title.split():
+                            title_word_set.add(w.lower())
+
+                global MOVIE_TITLE_DICT
+                MOVIE_TITLE_DICT = dict()
+                for i, w in enumerate(title_word_set):
+                    MOVIE_TITLE_DICT[w] = i
+
+                global CATEGORIES_DICT
+                CATEGORIES_DICT = dict()
+                for i, c in enumerate(categories_set):
+                    CATEGORIES_DICT[c] = i
+
+                global USER_INFO
+                USER_INFO = dict()
+                with package.open('ml-1m/users.dat') as user_file:
+                    for line in user_file:
+                        uid, gender, age, job, _ = line.strip().split("::")
+                        USER_INFO[int(uid)] = UserInfo(
+                            index=uid, gender=gender, age=age, job_id=job)
+    return fn
+
+
+def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
+    fn = __initialize_meta_info__()
+    rand = random.Random(x=rand_seed)
+    with zipfile.ZipFile(file=fn) as package:
+        with package.open('ml-1m/ratings.dat') as rating:
+            for line in rating:
+                if (rand.random() < test_ratio) == is_test:
+                    uid, mov_id, rating, _ = line.strip().split("::")
+                    uid = int(uid)
+                    mov_id = int(mov_id)
+                    rating = float(rating) * 2 - 5.0
+
+                    mov = MOVIE_INFO[mov_id]
+                    usr = USER_INFO[uid]
+                    yield usr.value() + mov.value() + [[rating]]
+
+
+def __reader_creator__(**kwargs):
+    return lambda: __reader__(**kwargs)
+
+
+train = functools.partial(__reader_creator__, is_test=False)
+test = functools.partial(__reader_creator__, is_test=True)
+
+
+def get_movie_title_dict():
+    """
+    Get movie title dictionary.
+    """
+    __initialize_meta_info__()
+    return MOVIE_TITLE_DICT
+
+
+def __max_index_info__(a, b):
+    if a.index > b.index:
+        return a
+    else:
+        return b
+
+
+def max_movie_id():
+    """
+    Get the maximum value of movie id.
+    """
+    __initialize_meta_info__()
+    return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
+
+
+def max_user_id():
+    """
+    Get the maximum value of user id.
+    """
+    __initialize_meta_info__()
+    return reduce(__max_index_info__, USER_INFO.viewvalues()).index
+
+
+def __max_job_id_impl__(a, b):
+    if a.job_id > b.job_id:
+        return a
+    else:
+        return b
+
+
+def max_job_id():
+    """
+    Get the maximum value of job id.
+    """
+    __initialize_meta_info__()
+    return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
+
+
+def movie_categories():
+    """
+    Get movie categoriges dictionary.
+    """
+    __initialize_meta_info__()
+    return CATEGORIES_DICT
+
+
+def user_info():
+    """
+    Get user info dictionary.
+    """
+    __initialize_meta_info__()
+    return USER_INFO
+
+
+def movie_info():
+    """
+    Get movie info dictionary.
+    """
+    __initialize_meta_info__()
+    return MOVIE_INFO
+
+
+def unittest():
+    for train_count, _ in enumerate(train()()):
+        pass
+    for test_count, _ in enumerate(test()()):
+        pass
+
+    print train_count, test_count
+
+
+def fetch():
+    paddle.v2.dataset.common.download(URL, "movielens", MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train")
+    paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test")
+
+
+if __name__ == '__main__':
+    unittest()
diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py
new file mode 100644
index 0000000000..d3b3dd524c
--- /dev/null
+++ b/python/paddle/v2/dataset/mq2007.py
@@ -0,0 +1,333 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MQ2007 dataset
+
+MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross
+validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set,
+validation set and testing set.
+
+MQ2007 dataset from website
+http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar and parse training set and test set into paddle reader creators
+
+"""
+
+import os
+import functools
+import rarfile
+from common import download
+import numpy as np
+
+# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
+URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar"
+MD5 = "7be1640ae95c6408dab0ae7207bdc706"
+
+
+def __initialize_meta_info__():
+    """
+  download and extract the MQ2007 dataset
+  """
+    fn = fetch()
+    rar = rarfile.RarFile(fn)
+    dirpath = os.path.dirname(fn)
+    rar.extractall(path=dirpath)
+    return dirpath
+
+
+class Query(object):
+    """
+  queries used for learning to rank algorithms. It is created from relevance scores,  query-document feature vectors
+
+  Parameters:
+  ----------
+  query_id : int
+    query_id in dataset, mapping from query to relevance documents
+  relevance_score : int 
+    relevance score of query and document pair
+  feature_vector : array, dense feature
+    feature in vector format
+  description : string
+    comment section in query doc pair data
+  """
+
+    def __init__(self,
+                 query_id=-1,
+                 relevance_score=-1,
+                 feature_vector=None,
+                 description=""):
+        self.query_id = query_id
+        self.relevance_score = relevance_score
+        if feature_vector is None:
+            self.feature_vector = []
+        else:
+            self.feature_vector = feature_vector
+        self.description = description
+
+    def __str__(self):
+        string = "%s %s %s" % (str(self.relevance_score), str(self.query_id),
+                               " ".join(str(f) for f in self.feature_vector))
+        return string
+
+    # @classmethod
+    def _parse_(self, text):
+        """
+    parse line into Query
+    """
+        comment_position = text.find('#')
+        line = text[:comment_position].strip()
+        self.description = text[comment_position + 1:].strip()
+        parts = line.split()
+        if len(parts) != 48:
+            sys.stdout.write("expect 48 space split parts, get %d" %
+                             (len(parts)))
+            return None
+        # format : 0 qid:10 1:0.000272 2:0.000000 .... 
+        self.relevance_score = int(parts[0])
+        self.query_id = int(parts[1].split(':')[1])
+        for p in parts[2:]:
+            pair = p.split(':')
+            self.feature_vector.append(float(pair[1]))
+        return self
+
+
+class QueryList(object):
+    """
+  group query into list, every item in list is a Query
+  """
+
+    def __init__(self, querylist=None):
+        self.query_id = -1
+        if querylist is None:
+            self.querylist = []
+        else:
+            self.querylist = querylist
+            for query in self.querylist:
+                if self.query_id == -1:
+                    self.query_id = query.query_id
+                else:
+                    if self.query_id != query.query_id:
+                        raise ValueError("query in list must be same query_id")
+
+    def __iter__(self):
+        for query in self.querylist:
+            yield query
+
+    def __len__(self):
+        return len(self.querylist)
+
+    def __getitem__(self, i):
+        return self.querylist[i]
+
+    def _correct_ranking_(self):
+        if self.querylist is None:
+            return
+        self.querylist.sort(key=lambda x: x.relevance_score, reverse=True)
+
+    def _add_query(self, query):
+        if self.query_id == -1:
+            self.query_id = query.query_id
+        else:
+            if self.query_id != query.query_id:
+                raise ValueError("query in list must be same query_id")
+        self.querylist.append(query)
+
+
+def gen_plain_txt(querylist):
+    """
+  gen plain text in list for other usage
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+
+  return :
+  ------
+  query_id : np.array, shape=(samples_num, )
+  label : np.array, shape=(samples_num, )
+  querylist : np.array, shape=(samples_num, feature_dimension)
+    """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    for query in querylist:
+        yield querylist.query_id, query.relevance_score, np.array(
+            query.feature_vector)
+
+
+def gen_point(querylist):
+    """
+  gen item in list for point-wise learning to rank algorithm
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+
+  return :
+  ------
+  label : np.array, shape=(samples_num, )
+  querylist : np.array, shape=(samples_num, feature_dimension)
+  """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    for query in querylist:
+        yield query.relevance_score, np.array(query.feature_vector)
+
+
+def gen_pair(querylist, partial_order="full"):
+    """
+  gen pair for pair-wise learning to rank algorithm
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+  pairtial_order : "full" or "neighbour"
+    there is redudant in all possiable pair combinations, which can be simplifed
+  gen pairs for neighbour items or the full partial order pairs
+
+  return :
+  ------
+  label : np.array, shape=(1)
+  query_left : np.array, shape=(1, feature_dimension)
+  query_right : same as left
+  """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    labels = []
+    docpairs = []
+
+    # C(n,2)
+    for i in range(len(querylist)):
+        query_left = querylist[i]
+        for j in range(i + 1, len(querylist)):
+            query_right = querylist[j]
+            if query_left.relevance_score > query_right.relevance_score:
+                labels.append([1])
+                docpairs.append([
+                    np.array(query_left.feature_vector),
+                    np.array(query_right.feature_vector)
+                ])
+            elif query_left.relevance_score < query_right.relevance_score:
+                labels.append([1])
+                docpairs.append([
+                    np.array(query_right.feature_vector),
+                    np.array(query_left.feature_vector)
+                ])
+    for label, pair in zip(labels, docpairs):
+        yield np.array(label), pair[0], pair[1]
+
+
+def gen_list(querylist):
+    """
+  gen item in list for list-wise learning to rank algorithm
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+
+  return :
+  ------
+  label : np.array, shape=(samples_num, )
+  querylist : np.array, shape=(samples_num, feature_dimension)
+  """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    relevance_score_list = [[query.relevance_score] for query in querylist]
+    feature_vector_list = [query.feature_vector for query in querylist]
+    yield np.array(relevance_score_list), np.array(feature_vector_list)
+
+
+def query_filter(querylists):
+    """
+    filter query get only document with label 0.
+    label 0, 1, 2 means the relevance score document with query
+    parameters :
+      querylist : QueyList list
+
+    return :
+      querylist : QueyList list
+    """
+    filter_query = []
+    for querylist in querylists:
+        relevance_score_list = [query.relevance_score for query in querylist]
+        if sum(relevance_score_list) != .0:
+            filter_query.append(querylist)
+    return filter_query
+
+
+def load_from_text(filepath, shuffle=False, fill_missing=-1):
+    """
+  parse data file into querys
+  """
+    prev_query_id = -1
+    querylists = []
+    querylist = None
+    fn = __initialize_meta_info__()
+    with open(os.path.join(fn, filepath)) as f:
+        for line in f:
+            query = Query()
+            query = query._parse_(line)
+            if query == None:
+                continue
+            if query.query_id != prev_query_id:
+                if querylist is not None:
+                    querylists.append(querylist)
+                querylist = QueryList()
+                prev_query_id = query.query_id
+            querylist._add_query(query)
+    if querylist is not None:
+        querylists.append(querylist)
+    return querylists
+
+
+def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
+    """
+  Parameters
+  --------
+  filename : string
+  fill_missing : fill the missing value. default in MQ2007 is -1
+  
+  Returns
+  ------
+  yield
+    label query_left, query_right  # format = "pairwise"
+    label querylist # format = "listwise"
+  """
+    querylists = query_filter(
+        load_from_text(
+            filepath, shuffle=shuffle, fill_missing=fill_missing))
+    for querylist in querylists:
+        if format == "plain_txt":
+            yield next(gen_plain_txt(querylist))
+        elif format == "pointwise":
+            yield next(gen_point(querylist))
+        elif format == "pairwise":
+            for pair in gen_pair(querylist):
+                yield pair
+        elif format == "listwise":
+            yield next(gen_list(querylist))
+
+
+train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
+test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")
+
+
+def fetch():
+    return download(URL, "MQ2007", MD5)
+
+
+if __name__ == "__main__":
+    fetch()
+    mytest = functools.partial(
+        __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
+    for label, query in mytest():
+        print label, query
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
new file mode 100644
index 0000000000..b0b9757c1a
--- /dev/null
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -0,0 +1,141 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The script fetch and preprocess movie_reviews data set that provided by NLTK
+
+TODO(yuyang18): Complete dataset.
+"""
+
+import collections
+from itertools import chain
+
+import nltk
+from nltk.corpus import movie_reviews
+
+import paddle.v2.dataset.common
+
+__all__ = ['train', 'test', 'get_word_dict', 'convert']
+NUM_TRAINING_INSTANCES = 1600
+NUM_TOTAL_INSTANCES = 2000
+
+
+def download_data_if_not_yet():
+    """
+    Download the data set, if the data set is not download.
+    """
+    try:
+        # make sure that nltk can find the data
+        if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
+            nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
+        movie_reviews.categories()
+    except LookupError:
+        print "Downloading movie_reviews data set, please wait....."
+        nltk.download(
+            'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
+        print "Download data set success....."
+        print "Path is " + nltk.data.find('corpora/movie_reviews').path
+
+
+def get_word_dict():
+    """
+    Sorted the words by the frequency of words which occur in sample
+    :return:
+        words_freq_sorted
+    """
+    words_freq_sorted = list()
+    word_freq_dict = collections.defaultdict(int)
+    download_data_if_not_yet()
+
+    for category in movie_reviews.categories():
+        for field in movie_reviews.fileids(category):
+            for words in movie_reviews.words(field):
+                word_freq_dict[words] += 1
+    words_sort_list = word_freq_dict.items()
+    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
+    for index, word in enumerate(words_sort_list):
+        words_freq_sorted.append((word[0], index))
+    return words_freq_sorted
+
+
+def sort_files():
+    """
+    Sorted the sample for cross reading the sample
+    :return:
+        files_list
+    """
+    files_list = list()
+    neg_file_list = movie_reviews.fileids('neg')
+    pos_file_list = movie_reviews.fileids('pos')
+    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
+    return files_list
+
+
+def load_sentiment_data():
+    """
+    Load the data set
+    :return:
+        data_set
+    """
+    data_set = list()
+    download_data_if_not_yet()
+    words_ids = dict(get_word_dict())
+    for sample_file in sort_files():
+        words_list = list()
+        category = 0 if 'neg' in sample_file else 1
+        for word in movie_reviews.words(sample_file):
+            words_list.append(words_ids[word.lower()])
+        data_set.append((words_list, category))
+    return data_set
+
+
+def reader_creator(data):
+    """
+    Reader creator, generate an iterator for data set
+    :param data:
+        train data set or test data set
+    """
+    for each in data:
+        yield each[0], each[1]
+
+
+def train():
+    """
+    Default training set reader creator
+    """
+    data_set = load_sentiment_data()
+    return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])
+
+
+def test():
+    """
+    Default test set reader creator
+    """
+    data_set = load_sentiment_data()
+    return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
+
+
+def fetch():
+    nltk.download(
+        'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train")
+    paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test")
diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py
new file mode 100644
index 0000000000..e0e18229da
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/cifar_test.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.cifar
+import unittest
+
+
+class TestCIFAR(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3072)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_test10(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.test10())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_train10(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.train10())
+        self.assertEqual(instances, 50000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test100(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.test100())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 99)
+
+    def test_train100(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.train100())
+        self.assertEqual(instances, 50000)
+        self.assertEqual(max_label_value, 99)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py
new file mode 100644
index 0000000000..cfa194eba3
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/common_test.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.common
+import unittest
+import tempfile
+import glob
+
+
+class TestCommon(unittest.TestCase):
+    def test_md5file(self):
+        _, temp_path = tempfile.mkstemp()
+        with open(temp_path, 'w') as f:
+            f.write("Hello\n")
+        self.assertEqual('09f7e02f1290be211da707a266f153b3',
+                         paddle.v2.dataset.common.md5file(temp_path))
+
+    def test_download(self):
+        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
+        self.assertEqual(
+            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
+            paddle.v2.dataset.common.download(
+                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
+
+    def test_split(self):
+        def test_reader():
+            def reader():
+                for x in xrange(10):
+                    yield x
+
+            return reader
+
+        _, temp_path = tempfile.mkstemp()
+        paddle.v2.dataset.common.split(
+            test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
+        files = glob.glob(temp_path + '/test-%05d.pickle')
+        self.assertEqual(len(files), 3)
+
+    def test_cluster_file_reader(self):
+        _, temp_path = tempfile.mkstemp()
+        for x in xrange(5):
+            with open(temp_path + '/%05d.test' % x) as f:
+                f.write('%d\n' % x)
+        reader = paddle.v2.dataset.common.cluster_files_reader(
+            temp_path + '/*.test', 5, 0)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str("0"))
+
+    def test_convert(self):
+        record_num = 10
+        num_shards = 4
+
+        def test_reader():
+            def reader():
+                for x in xrange(record_num):
+                    yield x
+
+            return reader
+
+        path = tempfile.mkdtemp()
+        paddle.v2.dataset.common.convert(path,
+                                         test_reader(), num_shards,
+                                         'random_images')
+
+        files = glob.glob(path + '/random_images-*')
+        self.assertEqual(len(files), num_shards)
+
+        recs = []
+        for i in range(0, num_shards):
+            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
+            r = recordio.reader(n)
+            while True:
+                d = r.read()
+                if d is None:
+                    break
+                recs.append(d)
+
+        recs.sort()
+        self.assertEqual(total, record_num)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py
new file mode 100644
index 0000000000..a8ae9a07ac
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/flowers_test.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.flowers
+import unittest
+
+
+class TestFlowers(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        size = 224 * 224 * 3
+        for l in reader():
+            self.assertEqual(l[0].size, size)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.flowers.train())
+        self.assertEqual(instances, 6149)
+        self.assertEqual(max_label_value, 102)
+
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.flowers.test())
+        self.assertEqual(instances, 1020)
+        self.assertEqual(max_label_value, 102)
+
+    def test_valid(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.flowers.valid())
+        self.assertEqual(instances, 1020)
+        self.assertEqual(max_label_value, 102)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py
new file mode 100644
index 0000000000..c4d82f2689
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/imdb_test.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.imdb
+import unittest
+import re
+
+TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
+TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
+TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
+
+TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
+TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
+TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
+
+
+class TestIMDB(unittest.TestCase):
+    word_idx = None
+
+    def test_build_dict(self):
+        if self.word_idx == None:
+            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
+                                                              150)
+
+        self.assertEqual(len(self.word_idx), 7036)
+
+    def check_dataset(self, dataset, expected_size):
+        if self.word_idx == None:
+            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
+                                                              150)
+
+        sum = 0
+        for l in dataset(self.word_idx):
+            self.assertEqual(l[1], sum % 2)
+            sum += 1
+        self.assertEqual(sum, expected_size)
+
+    def test_train(self):
+        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)
+
+    def test_test(self):
+        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py
new file mode 100644
index 0000000000..714a75d6f1
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/imikolov_test.py
@@ -0,0 +1,67 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.imikolov
+import unittest
+
+WORD_DICT = paddle.v2.dataset.imikolov.build_dict()
+
+
+class TestMikolov(unittest.TestCase):
+    def check_reader(self, reader, n):
+        for l in reader():
+            self.assertEqual(len(l), n)
+
+    def test_train(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n)
+
+        first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\
+            'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\
+            'rake regatta rubens sim snack-food ssangyong swapo wachter'
+        first_line = [
+            WORD_DICT.get(ch, WORD_DICT['<unk>'])
+            for ch in first_line.split(' ')
+        ]
+        for l in paddle.v2.dataset.imikolov.train(
+                WORD_DICT, n=-1,
+                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
+            read_line = l[0][1:]
+            break
+        self.assertEqual(first_line, read_line)
+
+    def test_test(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n)
+
+        first_line = 'consumers may want to move their telephones a little '\
+                'closer to the tv set'
+        first_line = [
+            WORD_DICT.get(ch, WORD_DICT['<unk>'])
+            for ch in first_line.split(' ')
+        ]
+        for l in paddle.v2.dataset.imikolov.test(
+                WORD_DICT, n=-1,
+                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
+            read_line = l[0][1:]
+            break
+        self.assertEqual(first_line, read_line)
+
+    def test_total(self):
+        _, idx = zip(*WORD_DICT.items())
+        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py
new file mode 100644
index 0000000000..1d344cac3e
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/mnist_test.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.mnist
+import unittest
+
+
+class TestMNIST(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 784)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.train())
+        self.assertEqual(instances, 60000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.test())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mq2007_test.py b/python/paddle/v2/dataset/tests/mq2007_test.py
new file mode 100644
index 0000000000..59847b6c18
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/mq2007_test.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.mq2007
+import unittest
+
+
+class TestMQ2007(unittest.TestCase):
+    def test_pairwise(self):
+        for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
+                format="pairwise"):
+            self.assertEqual(query_left.shape(), (46, ))
+            self.assertEqual(query_right.shape(), (46, ))
+
+    def test_listwise(self):
+        for label_array, query_array in paddle.v2.dataset.mq2007.test(
+                format="listwise"):
+            self.assertEqual(len(label_array), len(query_array))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py
new file mode 100644
index 0000000000..4074052907
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/test_sentiment.py
@@ -0,0 +1,55 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import nltk
+import paddle.v2.dataset.sentiment as st
+from nltk.corpus import movie_reviews
+
+
+class TestSentimentMethods(unittest.TestCase):
+    def test_get_word_dict(self):
+        word_dict = st.get_word_dict()[0:10]
+        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
+                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
+                          (u'is', 8), (u'in', 9)]
+        for idx, each in enumerate(word_dict):
+            self.assertEqual(each, test_word_list[idx])
+        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
+
+    def test_sort_files(self):
+        last_label = ''
+        for sample_file in st.sort_files():
+            current_label = sample_file.split("/")[0]
+            self.assertNotEqual(current_label, last_label)
+            last_label = current_label
+
+    def test_data_set(self):
+        data_set = st.load_sentiment_data()
+        last_label = -1
+        for each in st.test():
+            self.assertNotEqual(each[1], last_label)
+            last_label = each[1]
+        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
+        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
+        self.assertEqual(
+            len(list(st.test())),
+            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py
new file mode 100644
index 0000000000..31e72ebf5e
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/voc2012_test.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.voc2012
+import unittest
+
+
+class TestVOC(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3 * l[1].size)
+            sum += 1
+        return sum
+
+    def test_train(self):
+        count = self.check_reader(paddle.v2.dataset.voc_seg.train())
+        self.assertEqual(count, 2913)
+
+    def test_test(self):
+        count = self.check_reader(paddle.v2.dataset.voc_seg.test())
+        self.assertEqual(count, 1464)
+
+    def test_val(self):
+        count = self.check_reader(paddle.v2.dataset.voc_seg.val())
+        self.assertEqual(count, 1449)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/wmt16_test.py b/python/paddle/v2/dataset/tests/wmt16_test.py
new file mode 100644
index 0000000000..cef6c3216e
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/wmt16_test.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.wmt16
+import unittest
+
+
+class TestWMT16(unittest.TestCase):
+    def checkout_one_sample(self, sample):
+        # train data has 3 field: source language word indices,
+        # target language word indices, and target next word indices.
+        self.assertEqual(len(sample), 3)
+
+        # test start mark and end mark in source word indices.
+        self.assertEqual(sample[0][0], 0)
+        self.assertEqual(sample[0][-1], 1)
+
+        # test start mask in target word indices
+        self.assertEqual(sample[1][0], 0)
+
+        # test en mask in target next word indices
+        self.assertEqual(sample[2][-1], 1)
+
+    def test_train(self):
+        for idx, sample in enumerate(
+                paddle.v2.dataset.wmt16.train(
+                    src_dict_size=100000, trg_dict_size=100000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_test(self):
+        for idx, sample in enumerate(
+                paddle.v2.dataset.wmt16.test(
+                    src_dict_size=1000, trg_dict_size=1000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_val(self):
+        for idx, sample in enumerate(
+                paddle.v2.dataset.wmt16.validation(
+                    src_dict_size=1000, trg_dict_size=1000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_get_dict(self):
+        dict_size = 1000
+        word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True)
+        self.assertEqual(len(word_dict), dict_size)
+        self.assertEqual(word_dict[0], "<s>")
+        self.assertEqual(word_dict[1], "<e>")
+        self.assertEqual(word_dict[2], "<unk>")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
new file mode 100644
index 0000000000..f10bf7e42a
--- /dev/null
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+UCI Housing dataset.
+
+This module will download dataset from
+https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
+parse training set and test set into paddle reader creators.
+"""
+
+import numpy as np
+import os
+import paddle.v2.dataset.common
+from paddle.v2.parameters import Parameters
+
+__all__ = ['train', 'test']
+
+URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
+MD5 = 'd4accdce7a25600298819f8e28e8d593'
+feature_names = [
+    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
+    'PTRATIO', 'B', 'LSTAT', 'convert'
+]
+
+UCI_TRAIN_DATA = None
+UCI_TEST_DATA = None
+URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
+MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
+
+
+def feature_range(maximums, minimums):
+    import matplotlib
+    matplotlib.use('Agg')
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots()
+    feature_num = len(maximums)
+    ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
+    ax.set_title('feature scale')
+    plt.xticks(range(feature_num), feature_names)
+    plt.xlim([-1, feature_num])
+    fig.set_figheight(6)
+    fig.set_figwidth(10)
+    if not os.path.exists('./image'):
+        os.makedirs('./image')
+    fig.savefig('image/ranges.png', dpi=48)
+    plt.close(fig)
+
+
+def load_data(filename, feature_num=14, ratio=0.8):
+    global UCI_TRAIN_DATA, UCI_TEST_DATA
+    if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
+        return
+
+    data = np.fromfile(filename, sep=' ')
+    data = data.reshape(data.shape[0] / feature_num, feature_num)
+    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
+        axis=0) / data.shape[0]
+    feature_range(maximums[:-1], minimums[:-1])
+    for i in xrange(feature_num - 1):
+        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
+    offset = int(data.shape[0] * ratio)
+    UCI_TRAIN_DATA = data[:offset]
+    UCI_TEST_DATA = data[offset:]
+
+
+def train():
+    """
+    UCI_HOUSING training set creator.
+
+    It returns a reader creator, each sample in the reader is features after
+    normalization and price number.
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    global UCI_TRAIN_DATA
+    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
+
+    def reader():
+        for d in UCI_TRAIN_DATA:
+            yield d[:-1], d[-1:]
+
+    return reader
+
+
+def test():
+    """
+    UCI_HOUSING test set creator.
+
+    It returns a reader creator, each sample in the reader is features after
+    normalization and price number.
+
+    :return: Test reader creator
+    :rtype: callable
+    """
+    global UCI_TEST_DATA
+    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
+
+    def reader():
+        for d in UCI_TEST_DATA:
+            yield d[:-1], d[-1:]
+
+    return reader
+
+
+def model():
+    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
+                                                 MD5_MODEL)
+    with open(tar_file, 'r') as f:
+        parameters = Parameters.from_tar(f)
+    return parameters
+
+
+def fetch():
+    paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train")
+    paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py
new file mode 100644
index 0000000000..617e212d67
--- /dev/null
+++ b/python/paddle/v2/dataset/voc2012.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Image dataset for segmentation.
+The 2012 dataset contains images from 2008-2011 for which additional
+segmentations have been prepared. As in previous years the assignment
+to training/test sets has been maintained. The total number of images
+with segmentation has been increased from 7,062 to 9,993.
+"""
+
+import tarfile
+import io
+import numpy as np
+from paddle.v2.dataset.common import download
+from paddle.v2.image import *
+from PIL import Image
+
+__all__ = ['train', 'test', 'val']
+
+VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\
+VOCtrainval_11-May-2012.tar'
+
+VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd'
+SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt'
+DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg'
+LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png'
+
+CACHE_DIR = 'voc2012'
+
+
+def reader_creator(filename, sub_name):
+
+    tarobject = tarfile.open(filename)
+    name2mem = {}
+    for ele in tarobject.getmembers():
+        name2mem[ele.name] = ele
+
+    def reader():
+        set_file = SET_FILE.format(sub_name)
+        sets = tarobject.extractfile(name2mem[set_file])
+        for line in sets:
+            line = line.strip()
+            data_file = DATA_FILE.format(line)
+            label_file = LABEL_FILE.format(line)
+            data = tarobject.extractfile(name2mem[data_file]).read()
+            label = tarobject.extractfile(name2mem[label_file]).read()
+            data = Image.open(io.BytesIO(data))
+            label = Image.open(io.BytesIO(label))
+            data = np.array(data)
+            label = np.array(label)
+            yield data, label
+
+    return reader
+
+
+def train():
+    """
+    Create a train dataset reader containing 2913 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval')
+
+
+def test():
+    """
+    Create a test dataset reader containing 1464 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train')
+
+
+def val():
+    """
+    Create a val dataset reader containing 1449 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val')
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
new file mode 100644
index 0000000000..5104e29051
--- /dev/null
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -0,0 +1,182 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+WMT14 dataset.
+The original WMT14 dataset is too large and a small set of data for set is
+provided. This module will download dataset from
+http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
+parse training set and test set into paddle reader creators.
+
+"""
+import tarfile
+import gzip
+
+import paddle.v2.dataset.common
+from paddle.v2.parameters import Parameters
+
+__all__ = [
+    'train',
+    'test',
+    'get_dict',
+    'convert',
+]
+
+URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
+                'cslm_joint_paper/data/dev+test.tgz')
+MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
+# this is a small set of data for test. The original data is too large and
+# will be add later.
+URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
+             'wmt_shrinked_data/wmt14.tgz')
+MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
+# BLEU of this trained model is 26.92
+URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
+MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
+
+START = "<s>"
+END = "<e>"
+UNK = "<unk>"
+UNK_IDX = 2
+
+
+def __read_to_dict(tar_file, dict_size):
+    def __to_dict(fd, size):
+        out_dict = dict()
+        for line_count, line in enumerate(fd):
+            if line_count < size:
+                out_dict[line.strip()] = line_count
+            else:
+                break
+        return out_dict
+
+    with tarfile.open(tar_file, mode='r') as f:
+        names = [
+            each_item.name for each_item in f
+            if each_item.name.endswith("src.dict")
+        ]
+        assert len(names) == 1
+        src_dict = __to_dict(f.extractfile(names[0]), dict_size)
+        names = [
+            each_item.name for each_item in f
+            if each_item.name.endswith("trg.dict")
+        ]
+        assert len(names) == 1
+        trg_dict = __to_dict(f.extractfile(names[0]), dict_size)
+        return src_dict, trg_dict
+
+
+def reader_creator(tar_file, file_name, dict_size):
+    def reader():
+        src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
+        with tarfile.open(tar_file, mode='r') as f:
+            names = [
+                each_item.name for each_item in f
+                if each_item.name.endswith(file_name)
+            ]
+            for name in names:
+                for line in f.extractfile(name):
+                    line_split = line.strip().split('\t')
+                    if len(line_split) != 2:
+                        continue
+                    src_seq = line_split[0]  # one source sequence
+                    src_words = src_seq.split()
+                    src_ids = [
+                        src_dict.get(w, UNK_IDX)
+                        for w in [START] + src_words + [END]
+                    ]
+
+                    trg_seq = line_split[1]  # one target sequence
+                    trg_words = trg_seq.split()
+                    trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
+
+                    # remove sequence whose length > 80 in training mode
+                    if len(src_ids) > 80 or len(trg_ids) > 80:
+                        continue
+                    trg_ids_next = trg_ids + [trg_dict[END]]
+                    trg_ids = [trg_dict[START]] + trg_ids
+
+                    yield src_ids, trg_ids, trg_ids_next
+
+    return reader
+
+
+def train(dict_size):
+    """
+    WMT14 training set creator.
+
+    It returns a reader creator, each sample in the reader is source language
+    word ID sequence, target language word ID sequence and next word ID
+    sequence.
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'train/train', dict_size)
+
+
+def test(dict_size):
+    """
+    WMT14 test set creator.
+
+    It returns a reader creator, each sample in the reader is source language
+    word ID sequence, target language word ID sequence and next word ID
+    sequence.
+
+    :return: Test reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'test/test', dict_size)
+
+
+def gen(dict_size):
+    return reader_creator(
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'gen/gen', dict_size)
+
+
+def model():
+    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
+    with gzip.open(tar_file, 'r') as f:
+        parameters = Parameters.from_tar(f)
+    return parameters
+
+
+def get_dict(dict_size, reverse=True):
+    # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
+    # else reverse = true, return dict = {'001':'a', '002':'b', ...}
+    tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
+    if reverse:
+        src_dict = {v: k for k, v in src_dict.items()}
+        trg_dict = {v: k for k, v in trg_dict.items()}
+    return src_dict, trg_dict
+
+
+def fetch():
+    paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    dict_size = 30000
+    paddle.v2.dataset.common.convert(path,
+                                     train(dict_size), 1000, "wmt14_train")
+    paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py
new file mode 100644
index 0000000000..c8818f715b
--- /dev/null
+++ b/python/paddle/v2/dataset/wmt16.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ACL2016 Multimodal Machine Translation. Please see this website for more
+details: http://www.statmt.org/wmt16/multimodal-task.html#task1
+
+If you use the dataset created for your task, please cite the following paper:
+Multi30K: Multilingual English-German Image Descriptions.
+
+@article{elliott-EtAl:2016:VL16,
+ author    = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.},
+ title     = {Multi30K: Multilingual English-German Image Descriptions},
+ booktitle = {Proceedings of the 6th Workshop on Vision and Language},
+ year      = {2016},
+ pages     = {70--74},
+ year      = 2016
+}
+"""
+
+import os
+import tarfile
+import gzip
+from collections import defaultdict
+
+import paddle.v2.dataset.common
+
+__all__ = [
+    "train",
+    "test",
+    "validation",
+    "convert",
+    "fetch",
+    "get_dict",
+]
+
+DATA_URL = ("http://cloud.dlnel.org/filepub/"
+            "?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed")
+DATA_MD5 = "0c38be43600334966403524a40dcd81e"
+
+TOTAL_EN_WORDS = 11250
+TOTAL_DE_WORDS = 19220
+
+START_MARK = "<s>"
+END_MARK = "<e>"
+UNK_MARK = "<unk>"
+
+
+def __build_dict(tar_file, dict_size, save_path, lang):
+    word_dict = defaultdict(int)
+    with tarfile.open(tar_file, mode="r") as f:
+        for line in f.extractfile("wmt16/train"):
+            line_split = line.strip().split("\t")
+            if len(line_split) != 2: continue
+            sen = line_split[0] if lang == "en" else line_split[1]
+            for w in sen.split():
+                word_dict[w] += 1
+
+    with open(save_path, "w") as fout:
+        fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
+        for idx, word in enumerate(
+                sorted(
+                    word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
+            if idx + 3 == dict_size: break
+            fout.write("%s\n" % (word[0]))
+
+
+def __load_dict(tar_file, dict_size, lang, reverse=False):
+    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
+                             "wmt16/%s_%d.dict" % (lang, dict_size))
+    if not os.path.exists(dict_path) or (
+            len(open(dict_path, "r").readlines()) != dict_size):
+        __build_dict(tar_file, dict_size, dict_path, lang)
+
+    word_dict = {}
+    with open(dict_path, "r") as fdict:
+        for idx, line in enumerate(fdict):
+            if reverse:
+                word_dict[idx] = line.strip()
+            else:
+                word_dict[line.strip()] = idx
+    return word_dict
+
+
+def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
+    src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else
+                                        TOTAL_DE_WORDS))
+    trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else
+                                        TOTAL_ENG_WORDS))
+    return src_dict_size, trg_dict_size
+
+
+def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
+    def reader():
+        src_dict = __load_dict(tar_file, src_dict_size, src_lang)
+        trg_dict = __load_dict(tar_file, trg_dict_size,
+                               ("de" if src_lang == "en" else "en"))
+
+        # the indice for start mark, end mark, and unk are the same in source
+        # language and target language. Here uses the source language
+        # dictionary to determine their indices.
+        start_id = src_dict[START_MARK]
+        end_id = src_dict[END_MARK]
+        unk_id = src_dict[UNK_MARK]
+
+        src_col = 0 if src_lang == "en" else 1
+        trg_col = 1 - src_col
+
+        with tarfile.open(tar_file, mode="r") as f:
+            for line in f.extractfile(file_name):
+                line_split = line.strip().split("\t")
+                if len(line_split) != 2:
+                    continue
+                src_words = line_split[src_col].split()
+                src_ids = [start_id] + [
+                    src_dict.get(w, unk_id) for w in src_words
+                ] + [end_id]
+
+                trg_words = line_split[trg_col].split()
+                trg_ids = [trg_dict.get(w, unk_id) for w in trg_words]
+
+                trg_ids_next = trg_ids + [end_id]
+                trg_ids = [start_id] + trg_ids
+
+                yield src_ids, trg_ids, trg_ids_next
+
+    return reader
+
+
+def train(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 train set reader.
+
+    This function returns the reader for train data. Each sample the reader
+    returns is made up of three fields: the source language word index sequence,
+    target language word index sequence and next word index sequence.
+
+
+    NOTE:
+    The original like for training data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
+    using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for Germany.
+
+    Returns:
+        callable: The train reader.
+    """
+
+    if src_lang not in ["en", "de"]:
+        raise ValueError("An error language type.  Only support: "
+                         "en (for English); de(for Germany).")
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
+                                                   src_lang)
+
+    return reader_creator(
+        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                   "wmt16.tar.gz"),
+        file_name="wmt16/train",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def test(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 test set reader.
+
+    This function returns the reader for test data. Each sample the reader
+    returns is made up of three fields: the source language word index sequence,
+    target language word index sequence and next word index sequence.
+
+    NOTE:
+    The original like for test data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
+    using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for Germany.
+
+    Returns:
+        callable: The test reader.
+    """
+
+    if src_lang not in ["en", "de"]:
+        raise ValueError("An error language type. "
+                         "Only support: en (for English); de(for Germany).")
+
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
+                                                   src_lang)
+
+    return reader_creator(
+        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                   "wmt16.tar.gz"),
+        file_name="wmt16/test",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def validation(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 validation set reader.
+
+    This function returns the reader for validation data. Each sample the reader
+    returns is made up of three fields: the source language word index sequence,
+    target language word index sequence and next word index sequence.
+
+    NOTE:
+    The original like for validation data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
+    using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for Germany.
+
+    Returns:
+        callable: The validation reader.
+    """
+    if src_lang not in ["en", "de"]:
+        raise ValueError("An error language type. "
+                         "Only support: en (for English); de(for Germany).")
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
+                                                   src_lang)
+
+    return reader_creator(
+        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                   "wmt16.tar.gz"),
+        file_name="wmt16/val",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def get_dict(lang, dict_size, reverse=False):
+    """
+    return the word dictionary for the specified language.
+
+    Args:
+        lang(string): A string indicating which language is the source
+                      language. Available options are: "en" for English
+                      and "de" for Germany.
+        dict_size(int): Size of the specified language dictionary.
+        reverse(bool): If reverse is set to False, the returned python
+                       dictionary will use word as key and use index as value.
+                       If reverse is set to True, the returned python
+                       dictionary will use index as key and word as value.
+
+    Returns:
+        dict: The word dictionary for the specific language.
+    """
+
+    if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS)
+    else: dict_size = min(dict_size, TOTAL_DE_WORDS)
+
+    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
+                             "wmt16/%s_%d.dict" % (lang, dict_size))
+    assert os.path.exists(dict_path), "Word dictionary does not exist. "
+    "Please invoke paddle.dataset.wmt16.train/test/validation first "
+    "to build the dictionary."
+    tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz")
+    return __load_dict(tar_file, dict_size, lang, reverse)
+
+
+def fetch():
+    """download the entire dataset.
+    """
+    paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                      "wmt16.tar.gz")
+
+
+def convert(path, src_dict_size, trg_dict_size, src_lang):
+    """Converts dataset to recordio format.
+    """
+
+    paddle.v2.dataset.common.convert(
+        path,
+        train(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_train")
+    paddle.v2.dataset.common.convert(
+        path,
+        test(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_test")
+    paddle.v2.dataset.common.convert(
+        path,
+        validation(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_validation")
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
new file mode 100644
index 0000000000..9235c41e9e
--- /dev/null
+++ b/python/paddle/v2/image.py
@@ -0,0 +1,381 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file contains some common interfaces for image preprocess.
+Many users are confused about the image layout. We introduce
+the image layout as follows.
+
+- CHW Layout
+
+  - The abbreviations: C=channel, H=Height, W=Width
+  - The default layout of image opened by cv2 or PIL is HWC.
+    PaddlePaddle only supports the CHW layout. And CHW is simply
+    a transpose of HWC. It must transpose the input image.
+
+- Color format: RGB or BGR
+
+  OpenCV use BGR color format. PIL use RGB color format. Both
+  formats can be used for training. Noted that, the format should
+  be keep consistent between the training and inference peroid.
+"""
+import numpy as np
+try:
+    import cv2
+except ImportError:
+    cv2 = None
+import os
+import tarfile
+import cPickle
+
+__all__ = [
+    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
+    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
+    "batch_images_from_tar"
+]
+
+
+def batch_images_from_tar(data_file,
+                          dataset_name,
+                          img2label,
+                          num_per_batch=1024):
+    """
+    Read images from tar file and batch them into batch file.
+
+    :param data_file: path of image tar file
+    :type data_file: string
+    :param dataset_name: 'train','test' or 'valid'
+    :type dataset_name: string
+    :param img2label: a dic with image file name as key 
+                    and image's label as value
+    :type img2label: dic
+    :param num_per_batch: image number per batch file
+    :type num_per_batch: int
+    :return: path of list file containing paths of batch file
+    :rtype: string
+    """
+    batch_dir = data_file + "_batch"
+    out_path = "%s/%s" % (batch_dir, dataset_name)
+    meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
+
+    if os.path.exists(out_path):
+        return meta_file
+    else:
+        os.makedirs(out_path)
+
+    tf = tarfile.open(data_file)
+    mems = tf.getmembers()
+    data = []
+    labels = []
+    file_id = 0
+    for mem in mems:
+        if mem.name in img2label:
+            data.append(tf.extractfile(mem).read())
+            labels.append(img2label[mem.name])
+            if len(data) == num_per_batch:
+                output = {}
+                output['label'] = labels
+                output['data'] = data
+                cPickle.dump(
+                    output,
+                    open('%s/batch_%d' % (out_path, file_id), 'w'),
+                    protocol=cPickle.HIGHEST_PROTOCOL)
+                file_id += 1
+                data = []
+                labels = []
+    if len(data) > 0:
+        output = {}
+        output['label'] = labels
+        output['data'] = data
+        cPickle.dump(
+            output,
+            open('%s/batch_%d' % (out_path, file_id), 'w'),
+            protocol=cPickle.HIGHEST_PROTOCOL)
+
+    with open(meta_file, 'a') as meta:
+        for file in os.listdir(out_path):
+            meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
+    return meta_file
+
+
+def load_image_bytes(bytes, is_color=True):
+    """
+    Load an color or gray image from bytes array.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        with open('cat.jpg') as f:
+            im = load_image_bytes(f.read())
+
+    :param bytes: the input image bytes array.
+    :type bytes: str
+    :param is_color: If set is_color True, it will load and
+                     return a color image. Otherwise, it will
+                     load and return a gray image.
+    :type is_color: bool
+    """
+    flag = 1 if is_color else 0
+    file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
+    img = cv2.imdecode(file_bytes, flag)
+    return img
+
+
+def load_image(file, is_color=True):
+    """
+    Load an color or gray image from the file path.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = load_image('cat.jpg')
+
+    :param file: the input image path.
+    :type file: string
+    :param is_color: If set is_color True, it will load and
+                     return a color image. Otherwise, it will
+                     load and return a gray image.
+    :type is_color: bool
+    """
+    # cv2.IMAGE_COLOR for OpenCV3
+    # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
+    # cv2.IMAGE_GRAYSCALE for OpenCV3
+    # cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV Version
+    # Here, use constant 1 and 0
+    # 1: COLOR, 0: GRAYSCALE
+    flag = 1 if is_color else 0
+    im = cv2.imread(file, flag)
+    return im
+
+
+def resize_short(im, size):
+    """ 
+    Resize an image so that the length of shorter edge is size.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = load_image('cat.jpg')
+        im = resize_short(im, 256)
+    
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the shorter edge size of image after resizing.
+    :type size: int
+    """
+    h, w = im.shape[:2]
+    h_new, w_new = size, size
+    if h > w:
+        h_new = size * h / w
+    else:
+        w_new = size * w / h
+    im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
+    return im
+
+
+def to_chw(im, order=(2, 0, 1)):
+    """
+    Transpose the input image order. The image layout is HWC format
+    opened by cv2 or PIL. Transpose the input image to CHW layout
+    according the order (2,0,1).
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = load_image('cat.jpg')
+        im = resize_short(im, 256)
+        im = to_chw(im)
+    
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param order: the transposed order.
+    :type order: tuple|list 
+    """
+    assert len(im.shape) == len(order)
+    im = im.transpose(order)
+    return im
+
+
+def center_crop(im, size, is_color=True):
+    """
+    Crop the center of image with size.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = center_crop(im, 224)
+    
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the cropping size.
+    :type size: int
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    """
+    h, w = im.shape[:2]
+    h_start = (h - size) / 2
+    w_start = (w - size) / 2
+    h_end, w_end = h_start + size, w_start + size
+    if is_color:
+        im = im[h_start:h_end, w_start:w_end, :]
+    else:
+        im = im[h_start:h_end, w_start:w_end]
+    return im
+
+
+def random_crop(im, size, is_color=True):
+    """
+    Randomly crop input image with size.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = random_crop(im, 224)
+    
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the cropping size.
+    :type size: int
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    """
+    h, w = im.shape[:2]
+    h_start = np.random.randint(0, h - size + 1)
+    w_start = np.random.randint(0, w - size + 1)
+    h_end, w_end = h_start + size, w_start + size
+    if is_color:
+        im = im[h_start:h_end, w_start:w_end, :]
+    else:
+        im = im[h_start:h_end, w_start:w_end]
+    return im
+
+
+def left_right_flip(im, is_color=True):
+    """
+    Flip an image along the horizontal direction.
+    Return the flipped image.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = left_right_flip(im)
+    
+    :param im: input image with HWC layout or HW layout for gray image
+    :type im: ndarray
+    :param is_color: whether input image is color or not
+    :type is_color: bool
+    """
+    if len(im.shape) == 3 and is_color:
+        return im[:, ::-1, :]
+    else:
+        return im[:, ::-1]
+
+
+def simple_transform(im,
+                     resize_size,
+                     crop_size,
+                     is_train,
+                     is_color=True,
+                     mean=None):
+    """
+    Simply data argumentation for training. These operations include
+    resizing, croping and flipping.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = simple_transform(im, 256, 224, True)
+
+    :param im: The input image with HWC layout.
+    :type im: ndarray
+    :param resize_size: The shorter edge length of the resized image.
+    :type resize_size: int
+    :param crop_size: The cropping size.
+    :type crop_size: int
+    :param is_train: Whether it is training or not.
+    :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or 
+                 mean values per channel.
+    :type mean: numpy array | list
+    """
+    im = resize_short(im, resize_size)
+    if is_train:
+        im = random_crop(im, crop_size, is_color=is_color)
+        if np.random.randint(2) == 0:
+            im = left_right_flip(im, is_color)
+    else:
+        im = center_crop(im, crop_size, is_color)
+        im = center_crop(im, crop_size, is_color=is_color)
+    if len(im.shape) == 3:
+        im = to_chw(im)
+
+    im = im.astype('float32')
+    if mean is not None:
+        mean = np.array(mean, dtype=np.float32)
+        # mean value, may be one value per channel 
+        if mean.ndim == 1 and is_color:
+            mean = mean[:, np.newaxis, np.newaxis]
+        elif mean.ndim == 1:
+            mean = mean
+        else:
+            # elementwise mean
+            assert len(mean.shape) == len(im)
+        im -= mean
+
+    return im
+
+
+def load_and_transform(filename,
+                       resize_size,
+                       crop_size,
+                       is_train,
+                       is_color=True,
+                       mean=None):
+    """
+    Load image from the input file `filename` and transform image for
+    data argumentation. Please refer to the `simple_transform` interface
+    for the transform operations.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = load_and_transform('cat.jpg', 256, 224, True)
+
+    :param filename: The file name of input image.
+    :type filename: string
+    :param resize_size: The shorter edge length of the resized image.
+    :type resize_size: int
+    :param crop_size: The cropping size.
+    :type crop_size: int
+    :param is_train: Whether it is training or not.
+    :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or 
+                 mean values per channel.
+    :type mean: numpy array | list
+    """
+    im = load_image(filename, is_color)
+    im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
+    return im
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
new file mode 100644
index 0000000000..317cf037c6
--- /dev/null
+++ b/python/paddle/v2/minibatch.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['batch']
+
+
+def batch(reader, batch_size):
+    """
+    Create a batched reader.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param batch_size: size of each mini-batch
+    :type batch_size: int
+    :return: the batched reader.
+    :rtype: callable
+    """
+
+    def batch_reader():
+        r = reader()
+        b = []
+        for instance in r:
+            b.append(instance)
+            if len(b) == batch_size:
+                yield b
+                b = []
+        if b:
+            yield b
+
+    return batch_reader
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py
new file mode 100644
index 0000000000..3b059735a9
--- /dev/null
+++ b/python/paddle/v2/reader/__init__.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+At training and testing time, PaddlePaddle programs need to read data. To ease
+the users' work to write data reading code, we define that
+
+- A *reader* is a function that reads data (from file, network, random number
+  generator, etc) and yields data items.
+- A *reader creator* is a function that returns a reader function.
+- A *reader decorator* is a function, which accepts one or more readers, and
+  returns a reader.
+- A *batch reader* is a function that reads data (from *reader*, file, network,
+  random number generator, etc) and yields a batch of data items.
+
+#####################
+Data Reader Interface
+#####################
+
+Indeed, *data reader* doesn't have to be a function that reads and yields data
+items. It can be any function with no parameter that creates a iterable
+(anything can be used in :code:`for x in iterable`)\:
+
+..  code-block:: python
+
+    iterable = data_reader()
+
+Element produced from the iterable should be a **single** entry of data,
+**not** a mini batch. That entry of data could be a single item, or a tuple of
+items.
+Item should be of `supported type <http://www.paddlepaddle.org/doc/ui/data_provider
+/pydataprovider2.html?highlight=dense_vector#input-types>`_ (e.g., numpy 1d
+array of float32, int, list of int)
+
+An example implementation for single item data reader creator:
+
+..  code-block:: python
+
+    def reader_creator_random_image(width, height):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+
+An example implementation for multiple item data reader creator:
+
+..  code-block:: python
+
+    def reader_creator_random_image_and_label(width, height, label):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height), label
+    return reader
+
+
+TODO(yuyang18): Should we add whole design doc here?
+"""
+
+import decorator
+from decorator import *
+
+import creator
+
+__all__ = decorator.__all__ + ['creator']
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
new file mode 100644
index 0000000000..fda5246d74
--- /dev/null
+++ b/python/paddle/v2/reader/creator.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Creator package contains some simple reader creator, which could
+be used in user program.
+"""
+
+__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
+
+
+def np_array(x):
+    """
+    Creates a reader that yields elements of x, if it is a
+    numpy vector. Or rows of x, if it is a numpy matrix.
+    Or any sub-hyperplane indexed by the highest dimension.
+
+    :param x: the numpy array to create reader from.
+    :returns: data reader created from x.
+    """
+
+    def reader():
+        if x.ndim < 1:
+            yield x
+
+        for e in x:
+            yield e
+
+    return reader
+
+
+def text_file(path):
+    """
+    Creates a data reader that outputs text line by line from given text file.
+    Trailing new line ('\\\\n') of each line will be removed.
+
+    :path: path of the text file.
+    :returns: data reader of text file
+    """
+
+    def reader():
+        f = open(path, "r")
+        for l in f:
+            yield l.rstrip('\n')
+        f.close()
+
+    return reader
+
+
+def recordio(paths, buf_size=100):
+    """
+    Creates a data reader from given RecordIO file paths separated by ",",
+        glob pattern is supported.
+    :path: path of recordio files, can be a string or a string list.
+    :returns: data reader of recordio files.
+    """
+
+    import recordio as rec
+    import paddle.v2.reader.decorator as dec
+    import cPickle as pickle
+
+    def reader():
+        if isinstance(paths, basestring):
+            path = paths
+        else:
+            path = ",".join(paths)
+        f = rec.reader(path)
+        while True:
+            r = f.read()
+            if r is None:
+                break
+            yield pickle.loads(r)
+        f.close()
+
+    return dec.buffered(reader, buf_size)
+
+
+pass_num = 0
+
+
+def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
+    """
+    Create a data reader that yield a record one by one from
+        the paths:
+    :paths: path of recordio files, can be a string or a string list.
+    :etcd_endpoints: the endpoints for etcd cluster
+    :returns: data reader of recordio files.
+
+    ..  code-block:: python
+        from paddle.v2.reader.creator import cloud_reader
+        etcd_endpoints = "http://127.0.0.1:2379"
+        trainer.train.(
+            reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
+        )
+    """
+    import os
+    import cPickle as pickle
+    import paddle.v2.master as master
+    c = master.client(etcd_endpoints, timeout_sec, buf_size)
+
+    if isinstance(paths, basestring):
+        path = [paths]
+    else:
+        path = paths
+    c.set_dataset(path)
+
+    def reader():
+        global pass_num
+        c.paddle_start_get_records(pass_num)
+        pass_num += 1
+
+        while True:
+            r, e = c.next_record()
+            if not r:
+                if e != -2:
+                    print "get record error: ", e
+                break
+            yield pickle.loads(r)
+
+    return reader
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
new file mode 100644
index 0000000000..44a6e34463
--- /dev/null
+++ b/python/paddle/v2/reader/decorator.py
@@ -0,0 +1,405 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
+    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
+]
+
+from threading import Thread
+import subprocess
+
+from Queue import Queue
+import itertools
+import random
+import zlib
+
+
+def map_readers(func, *readers):
+    """
+    Creates a data reader that outputs return value of function using
+    output of each data readers as arguments.
+
+    :param func: function to use. The type of func should be (Sample) => Sample
+    :type: callable
+    :param readers: readers whose outputs will be used as arguments of func.
+    :return: the created data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        for e in itertools.imap(func, *rs):
+            yield e
+
+    return reader
+
+
+def shuffle(reader, buf_size):
+    """
+    Creates a data reader whose data output is shuffled.
+
+    Output from the iterator that created by original reader will be
+    buffered into shuffle buffer, and then shuffled. The size of shuffle buffer
+    is determined by argument buf_size.
+
+    :param reader: the original reader whose output will be shuffled.
+    :type reader: callable
+    :param buf_size: shuffle buffer size.
+    :type buf_size: int
+
+    :return: the new reader whose output is shuffled.
+    :rtype: callable
+    """
+
+    def data_reader():
+        buf = []
+        for e in reader():
+            buf.append(e)
+            if len(buf) >= buf_size:
+                random.shuffle(buf)
+                for b in buf:
+                    yield b
+                buf = []
+
+        if len(buf) > 0:
+            random.shuffle(buf)
+            for b in buf:
+                yield b
+
+    return data_reader
+
+
+def chain(*readers):
+    """
+    Creates a data reader whose output is the outputs of input data
+    readers chained together.
+
+    If input readers output following data entries:
+    [0, 0, 0]
+    [1, 1, 1]
+    [2, 2, 2]
+    The chained reader will output:
+    [0, 0, 0, 1, 1, 1, 2, 2, 2]
+
+    :param readers: input readers.
+    :return: the new data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+
+        for e in itertools.chain(*rs):
+            yield e
+
+    return reader
+
+
+class ComposeNotAligned(ValueError):
+    pass
+
+
+def compose(*readers, **kwargs):
+    """
+    Creates a data reader whose output is the combination of input readers.
+
+    If input readers output following data entries:
+    (1, 2)    3    (4, 5)
+    The composed reader will output:
+    (1, 2, 3, 4, 5)
+
+    :param readers: readers that will be composed together.
+    :param check_alignment: if True, will check if input readers are aligned
+        correctly. If False, will not check alignment and trailing outputs
+        will be discarded. Defaults to True.
+    :type check_alignment: bool
+
+    :return: the new data reader.
+
+    :raises ComposeNotAligned: outputs of readers are not aligned.
+        Will not raise when check_alignment is set to False.
+    """
+    check_alignment = kwargs.pop('check_alignment', True)
+
+    def make_tuple(x):
+        if isinstance(x, tuple):
+            return x
+        else:
+            return (x, )
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        if not check_alignment:
+            for outputs in itertools.izip(*rs):
+                yield sum(map(make_tuple, outputs), ())
+        else:
+            for outputs in itertools.izip_longest(*rs):
+                for o in outputs:
+                    if o is None:
+                        # None will be not be present if compose is aligned
+                        raise ComposeNotAligned(
+                            "outputs of readers are not aligned.")
+                yield sum(map(make_tuple, outputs), ())
+
+    return reader
+
+
+def buffered(reader, size):
+    """
+    Creates a buffered data reader.
+
+    The buffered data reader will read and save data entries into a
+    buffer. Reading from the buffered data reader will proceed as long
+    as the buffer is not empty.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param size: max buffer size.
+    :type size: int
+
+    :returns: the buffered data reader.
+    """
+
+    class EndSignal():
+        pass
+
+    end = EndSignal()
+
+    def read_worker(r, q):
+        for d in r:
+            q.put(d)
+        q.put(end)
+
+    def data_reader():
+        r = reader()
+        q = Queue(maxsize=size)
+        t = Thread(
+            target=read_worker, args=(
+                r,
+                q, ))
+        t.daemon = True
+        t.start()
+        e = q.get()
+        while e != end:
+            yield e
+            e = q.get()
+
+    return data_reader
+
+
+def firstn(reader, n):
+    """
+    Limit the max number of samples that reader could return.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param n: the max number of samples that return.
+    :type n: int
+    :return: the decorated reader.
+    :rtype: callable
+    """
+
+    # TODO(yuyang18): Check if just drop the reader, could clean the opened
+    # resource or not?
+
+    def firstn_reader():
+        for i, item in enumerate(reader()):
+            if i == n:
+                break
+            yield item
+
+    return firstn_reader
+
+
+class XmapEndSignal():
+    pass
+
+
+def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
+    """
+    Use multiprocess to map samples from reader by a mapper defined by user.
+    And this function contains a buffered decorator.
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param reader: the data reader to read from
+    :type reader: callable
+    :param process_num: process number to handle original sample
+    :type process_num: int
+    :param buffer_size: max buffer size
+    :type buffer_size: int
+    :param order: keep the order of reader
+    :type order: bool
+    :return: the decarated reader
+    :rtype: callable
+    """
+    end = XmapEndSignal()
+
+    # define a worker to read samples from reader to in_queue
+    def read_worker(reader, in_queue):
+        for i in reader():
+            in_queue.put(i)
+        in_queue.put(end)
+
+    # define a worker to read samples from reader to in_queue with order flag
+    def order_read_worker(reader, in_queue):
+        in_order = 0
+        for i in reader():
+            in_queue.put((in_order, i))
+            in_order += 1
+        in_queue.put(end)
+
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue
+    def handle_worker(in_queue, out_queue, mapper):
+        sample = in_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            r = mapper(sample)
+            out_queue.put(r)
+            sample = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue by order
+    def order_handle_worker(in_queue, out_queue, mapper, out_order):
+        ins = in_queue.get()
+        while not isinstance(ins, XmapEndSignal):
+            order, sample = ins
+            r = mapper(sample)
+            while order != out_order[0]:
+                pass
+            out_queue.put(r)
+            out_order[0] += 1
+            ins = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    def xreader():
+        in_queue = Queue(buffer_size)
+        out_queue = Queue(buffer_size)
+        out_order = [0]
+        # start a read worker in a thread
+        target = order_read_worker if order else read_worker
+        t = Thread(target=target, args=(reader, in_queue))
+        t.daemon = True
+        t.start()
+        # start several handle_workers
+        target = order_handle_worker if order else handle_worker
+        args = (in_queue, out_queue, mapper, out_order) if order else (
+            in_queue, out_queue, mapper)
+        workers = []
+        for i in xrange(process_num):
+            worker = Thread(target=target, args=args)
+            worker.daemon = True
+            workers.append(worker)
+        for w in workers:
+            w.start()
+
+        sample = out_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            yield sample
+            sample = out_queue.get()
+        finish = 1
+        while finish < process_num:
+            sample = out_queue.get()
+            if isinstance(sample, XmapEndSignal):
+                finish += 1
+            else:
+                yield sample
+
+    return xreader
+
+
+def _buf2lines(buf, line_break="\n"):
+    # FIXME: line_break should be automatically configured.
+    lines = buf.split(line_break)
+    return lines[:-1], lines[-1]
+
+
+class PipeReader:
+    """
+        PipeReader read data by stream from a command, take it's 
+        stdout into a pipe buffer and redirect it to the parser to
+        parse, then yield data as your desired format.
+
+        You can using standard linux command or call another program
+        to read data, from HDFS, Ceph, URL, AWS S3 etc:
+
+        .. code-block:: python
+           cmd = "hadoop fs -cat /path/to/some/file"
+           cmd = "cat sample_file.tar.gz"
+           cmd = "curl http://someurl"
+           cmd = "python print_s3_bucket.py"
+
+        An example:
+
+        .. code-block:: python
+    
+           def example_reader():
+               for f in myfiles:
+                   pr = PipeReader("cat %s"%f)
+                   for l in pr.get_line():
+                       sample = l.split(" ")
+                       yield sample
+    """
+
+    def __init__(self, command, bufsize=8192, file_type="plain"):
+        if not isinstance(command, str):
+            raise TypeError("left_cmd must be a string")
+        if file_type == "gzip":
+            self.dec = zlib.decompressobj(
+                32 + zlib.MAX_WBITS)  # offset 32 to skip the header
+        self.file_type = file_type
+        self.bufsize = bufsize
+        self.process = subprocess.Popen(
+            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+
+    def get_line(self, cut_lines=True, line_break="\n"):
+        """
+        :param cut_lines: cut buffer to lines
+        :type cut_lines: bool
+        :param line_break: line break of the file, like \n or \r
+        :type line_break: string
+
+        :return: one line or a buffer of bytes
+        :rtype: string
+        """
+        remained = ""
+        while True:
+            buff = self.process.stdout.read(self.bufsize)
+            if buff:
+                if self.file_type == "gzip":
+                    decomp_buff = self.dec.decompress(buff)
+                elif self.file_type == "plain":
+                    decomp_buff = buff
+                else:
+                    raise TypeError("file_type %s is not allowed" %
+                                    self.file_type)
+
+                if cut_lines:
+                    lines, remained = _buf2lines(''.join(
+                        [remained, decomp_buff]), line_break)
+                    for line in lines:
+                        yield line
+                else:
+                    yield decomp_buff
+            else:
+                break
diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt
new file mode 100644
index 0000000000..107d5912e1
--- /dev/null
+++ b/python/paddle/v2/reader/tests/CMakeLists.txt
@@ -0,0 +1,2 @@
+py_test(creator_test SRCS creator_test.py)
+py_test(decorator_test SRCS decorator_test.py)
diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py
new file mode 100644
index 0000000000..eca2dce114
--- /dev/null
+++ b/python/paddle/v2/reader/tests/__init__.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
new file mode 100644
index 0000000000..7fe374e663
--- /dev/null
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -0,0 +1,74 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright PaddlePaddle contributors. All Rights Reservedd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import unittest
+import numpy as np
+import paddle.v2.reader.creator
+
+
+class TestNumpyArray(unittest.TestCase):
+    def test_numpy_array(self):
+        l = [[1, 2, 3], [4, 5, 6]]
+        x = np.array(l, np.int32)
+        reader = paddle.v2.reader.creator.np_array(x)
+        for idx, e in enumerate(reader()):
+            self.assertItemsEqual(e, l[idx])
+
+
+class TestTextFile(unittest.TestCase):
+    def test_text_file(self):
+        path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
+        reader = paddle.v2.reader.creator.text_file(path)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
+
+
+class TestRecordIO(unittest.TestCase):
+    def do_test(self, path):
+        reader = paddle.v2.reader.creator.recordio(path)
+        idx = 0
+        for e in reader():
+            if idx == 0:
+                self.assertEqual(e, (1, 2, 3))
+            elif idx == 1:
+                self.assertEqual(e, (4, 5, 6))
+            idx += 1
+        self.assertEqual(idx, 2)
+
+    def test_recordIO(self):
+        self.do_test(
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat"))
+        self.do_test([
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat")
+        ])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
new file mode 100644
index 0000000000..6b680e39f3
--- /dev/null
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -0,0 +1,178 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import unittest
+
+import paddle.v2.reader
+
+
+def reader_creator_10(dur):
+    def reader():
+        for i in range(10):
+            # this invocation helps testing paddle.reader.buffer
+            time.sleep(dur)
+            yield i
+
+    return reader
+
+
+class TestMap(unittest.TestCase):
+    def test_map(self):
+        d = {"h": 0, "i": 1}
+
+        def tokenize(x):
+            return d[x]
+
+        def read():
+            yield "h"
+            yield "i"
+
+        r = paddle.v2.reader.map_readers(tokenize, read)
+        for i, e in enumerate(r()):
+            self.assertEqual(e, i)
+
+
+class TestBuffered(unittest.TestCase):
+    def test_read(self):
+        for size in range(20):
+            b = paddle.v2.reader.buffered(reader_creator_10(0), size)
+            c = 0
+            for i in b():
+                self.assertEqual(i, c)
+                c += 1
+            self.assertEqual(c, 10)
+
+    def test_buffering(self):
+        # read have 30ms delay.
+        b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
+        last_time = time.time()
+        for idx, i in enumerate(b()):
+            elapsed_time = time.time() - last_time
+            if i == 0:
+                time.sleep(0.3)
+            else:
+                # read time should be short, meaning already buffered.
+                self.assertLess(elapsed_time, 0.05)
+            last_time = time.time()
+
+
+class TestCompose(unittest.TestCase):
+    def test_compse(self):
+        reader = paddle.v2.reader.compose(
+            reader_creator_10(0), reader_creator_10(0))
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, (idx, idx))
+
+    def test_compose_not_aligned(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0))
+        with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
+            for e in reader():
+                total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+    def test_compose_not_aligned_no_check(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0),
+            check_alignment=False)
+        for e in reader():
+            total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+
+class TestChain(unittest.TestCase):
+    def test_chain(self):
+        c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0))
+        idx = 0
+        for e in c():
+            self.assertEqual(e, idx % 10)
+            idx += 1
+        self.assertEqual(idx, 20)
+
+
+class TestShuffle(unittest.TestCase):
+    def test_shuffle(self):
+        case = [(0, True), (1, True), (10, False), (100, False)]
+        a = reader_creator_10(0)
+        for size, checkEq in case:
+            s = paddle.v2.reader.shuffle(a, size)
+            total = 0
+            for idx, e in enumerate(s()):
+                if checkEq:
+                    self.assertEqual(idx, e)
+                total += 1
+            self.assertEqual(total, 10)
+
+
+class TestXmap(unittest.TestCase):
+    def test_xmap(self):
+        def mapper(x):
+            return (x + 1)
+
+        orders = (True, False)
+        thread_nums = (1, 2, 4, 8, 16)
+        buffered_size = (1, 2, 4, 8, 16)
+        for order in orders:
+            for tNum in thread_nums:
+                for size in buffered_size:
+                    reader = paddle.v2.reader.xmap_readers(mapper,
+                                                           reader_creator_10(0),
+                                                           tNum, size, order)
+                    for n in xrange(3):
+                        result = []
+                        for i in reader():
+                            result.append(i)
+                        if not order:
+                            result.sort()
+                        for idx, e in enumerate(result):
+                            self.assertEqual(e, mapper(idx))
+
+
+class TestPipeReader(unittest.TestCase):
+    def test_pipe_reader(self):
+        def example_reader(myfiles):
+            for f in myfiles:
+                pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128)
+                for l in pr.get_line():
+                    yield l
+
+        import tempfile
+
+        records = [str(i) for i in xrange(5)]
+        temp = tempfile.NamedTemporaryFile()
+        try:
+            with open(temp.name, 'w') as f:
+                for r in records:
+                    f.write('%s\n' % r)
+
+            result = []
+            for r in example_reader([temp.name]):
+                result.append(r)
+
+            for idx, e in enumerate(records):
+                self.assertEqual(e, result[idx])
+        finally:
+            # delete the temporary file
+            temp.close()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt
new file mode 100644
index 0000000000..a2a8d47d43
--- /dev/null
+++ b/python/paddle/v2/reader/tests/test_data_creator.txt
@@ -0,0 +1,3 @@
+0 1
+2 3
+4 5
diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/v2/reader/tests/test_reader_recordio.dat
new file mode 100644
index 0000000000000000000000000000000000000000..a99a35bb829e066c4845d0b85b96cd1eb3a12491
GIT binary patch
literal 76
zcmZQ!W@4P2Bs!asfq}sSh?#)+KN|x>v0q|9K_sIV14Bftj}1RiRKwGd%hQO<)0nHI
Tz>rH1B4onlY0Bkk1`z@P(}N7c

literal 0
HcmV?d00001

diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat
new file mode 100644
index 0000000000000000000000000000000000000000..17aa89b6796184407e83246d3f342a55a66b4a69
GIT binary patch
literal 88
zcmZQ!W@2QOHw<B9U|?_oVlE*5&&I$|?3Wl&5Xor9z;M0c)+Lav0f;aJ5k?@w7(|$W
R2vZPY1|rNsgawGO1OWMk36uZ;

literal 0
HcmV?d00001

diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
index 46e4feb8e1..b4333ed530 100644
--- a/python/paddle/v2/tests/CMakeLists.txt
+++ b/python/paddle/v2/tests/CMakeLists.txt
@@ -1,4 +1,5 @@
 py_test(test_op SRCS test_op.py)
+py_test(test_image SRCS test_image.py)
 py_test(test_layer SRCS test_layer.py)
 py_test(test_topology SRCS test_topology.py)
 py_test(test_rnn_layer SRCS test_rnn_layer.py)
diff --git a/python/paddle/v2/tests/cat.jpg b/python/paddle/v2/tests/cat.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bc1fbbd371216b9904b522ed302700c79d2e4876
GIT binary patch
literal 57218
zcmbTcXH?T&^fefI??LIk1rR9#>Ai#!N<tTvCXfh7mm<9yL+=r(352E~9psTBozN1B
z2#QnzL1`k0I{e>v*34S-X=ZL#@+H5Ob$9ML`|P{V`M30M4Zvz<Y-$W3BLe`)u3vzE
z9{~mcT54(<YARY98X7t}T6#tx6C(oyBR4w-3y_~jP=KF@k55QUPEtrjR+NuV>JCU&
zUI7dS3rebKsVZv9DS;LL=SRrs=;#<37`d33xD<u?gcbikuYY|2Hd=B=3Ly$I5db+G
z83h~JzxM#a>wQv@{jUZ1KNlG}1tk?V4J{o#!}SAgtN?N{3JP*c3Mwi}%Il}2ufGRS
zvQe=ME9g;k*tpS%1OgS~vnyyt^_%-SVY5HPl-z?7=;&|U<l^QLmynbKNrRPD?x?D%
z8yFfHn?Ov>;I?-54vtRF9-a@qynPVJ;E>R;C*cv1iAl*RscFwpIk|cH1%*Y$n93?F
zuDS+aTlc1=wXMCQv#Wbx@cq#6$mrPk9Fa7?u(<SL`OEso=GOMk*WJCN<CD{~pXa|W
zF8>D?8Gzz{!@B<b-@yJKxY(|7kyBDqP}2MlE;8~b*KZ0oN-ALmYIZ#v8n-|W5yf~~
zpni5mb3dJ^66^=3d(bTX4KeT+@uUBN_P>z*zXO)={|njw1?>OEwG3dSAiF+13O0Za
z;ByTOwm4ChWV29^1<SQ9nn|g2BR~-87S!%}^e(V-ygR*((Ji{Td|p!Ca9t;x?QQrS
zP73>G&sL$D?W+5rz7snkNFI#nfYUAyBkLzPqV#6w1I?s2?(ioJO`>i6jf|D{&5RM_
zJ#OwIiU<W58%n{k{f4lNX**8j1xi<*-UZ%mW140Sih|KQzliJI?ITjD!pJ^gv6Zy|
z;sT0^JeRCGrvy<Q#~vqDV@=LxA%Y{%YP4ou&*c;buD}80S*S26a2y)tr-?#fg^e`f
zMyhZ!A7C-Anw%J4?S(#pcs#3ofl<<>AAW(MtKNQ2VeqUn3pSJNR$YWNun@s8Fk*2+
zmoS(skO#yh!~<33al=t#xTU~d_>&<i<w`Q=Dr6v-XcV8-Re`vNU=6m73vkYNK>2Vg
zGE*_qqSi=wkdqog%^pj~>Rv;GdWN(PZ@>=6ciTl!IZ9L@Mf&rdB1uumG5q++-D=Yt
zpFlXIFHC+tRfwxi8il6MhI%cH0dN(Ou(?j^@aiWx2^1o3PPLl34a8>~zWu_3YXVXa
zW7FlVj3j5h1?8$H&b8#FK&Uif^fC$sMz_GBRmviQ5U9AYvJnFmHx0}?6b>|rv@tbJ
zbC#;aQi6$=XV?*7-?L=h^@b+AKwDZdlN8XslnU;eN*`E_3WW|!lTgyiwe~4}0l+m7
zm3l=Jj}aIZQJssCqhN71FbKsGY($~JPJ)C2T!?Xwh!`k$v#ci8E>VEJa&o~l2`^E)
zpn<I6Y@0SSqy#R2J>N=M05Kj>Zn$0ud_MQmHsv~NkzSnKbz}-d;<v;NSd-u^&JooZ
zcQSJbl|~zw%t)NNhzuNp#c6>UUFN)U!EF$%Ft&!x6eY_@rHuitUIzmdT>SzHfN_rE
zYTREK;Va2$tG8?1vaR1-+8!sSO`e#k!>)stt*F&25vm+WW?BVa*)FL<HL=cAQI{GC
z@y+S;QCZdx87I`(!1|!A6^LI|z_^>JFoeeJLQ7U3COuuf$R{mDpfw4e&WUII2pL4>
z0y)bKRm<U{Lw7hCJ=4N*5*SrnEg5!hjz<fG4%~%;TT}AigHLh5Bv|9UQyxLf`=Hh@
z)_wWV+ejarL=LyBHKMBu+?~LG4X`$k8;1pY<-gZy1IEpot=~Y;!Mw5<^ma|nEohf3
z-Qd|aO$47J*7cXD0GGM-7nJh!VW6ez5*Qa5TnfK{yPx-htPkD{0nIbT^TMH8bxKfL
zw|z4z{$1UAR`(azo1K$({APj*D4$V>L1g8m4$MvMT$_ldW(X<PFP6YrD}HIm_sHZu
z4;xi{Cu?cz=03zo8s%fShuFXwR+nK~0rPC&pUT|I7J3*ni&^XUz!a2=HGoPE)Cb!@
zVJW?GN=VBd`cJUn);y79axBA;2qUJDGtj`$5RUY6Mx(^y)2>fbsK(wiyG;cvQe=;D
z6{>+N8cC_uS#*xVK*@-8wB5}h_P29>L(ukIKMC}*h)B`A(k){n@h&k}Pf{I70VX})
zPKJWENqhn+zyrzYi;>El(kQNxb+k1IDx8ibfU2M1i;}^E3okK<+B7CGJrn&S^RxgU
z7KU<mH5@3;_@6-Ilm*!6;)WoMV0sv+byx#DhX{sBWF<-DTY(26aGVC>`&OJhq=Z~u
z9WalXK7z3s6Dkj{BVz{g_#^b{zRDB0u~-5mHPPIH`XvV7GRH$DCq+iKT?l?DRZb;>
zG1AWEwDI6>M+d^#AV!XC3e{dEq;YCBD`j;|ob3dxFZz~Bp1nJT1^5K2454-}p%SX0
zub>jH%mp(du)yl9&<emFq6X%Ur>s7&;qen@241@22ohjQB2Nia9`hV9Tuu)skYfxp
zfL+*Ky+&4As4-L(gT`3sZ=>gIV8VJejq>mXyk{H?Lq&t}5CJbo!I=Ol7?(hElSo}i
zz7*b;RDN`K!~;?R!QX8(v*P3s@F^I`q7MMhfYzIvubI>hhH)#Wg;dAG*q}q?00e0U
zL^=eIxaacCp)}@Qu)=({7Wp3dYBmc$)`MCgEPF_L&XalMI?OCdbJ74;K2klDytEe)
z_eGBz2wP85RM!V>!gz|Q(srv$5c%Znd7;&FsQR=lo=|z70A*KE+fY$kuSyHo>vS%z
zVb04XV=+UN6HV(n`!EJ`If*ceC}OfwuPRR~<;4FwofKgNPeB%GiEc;^(OawJ)Htl#
zCUwm$0}T!W&^urbXsxRzPdNU>iaLoh5d#BR2U?0Wiln7sfj{xTNn?&&U(7%>W&zkD
z28ao8&%jworW3#C%_#sv+rVC%q3N?=KJIfrv6U5-Mp_7v`YZ+ku)!d}Ue{DqjFpIo
z=;;EV;5dEUXN@Qt0m9Y9CD%1lsK|{%@XC6|=?8PSyEw&?NVv44JPwPiV80~{Ze38W
zWE}=C!xzvnAHD13p#lMO$ck%f@kPWuB6$EJ3JxUhqCH^lrA8`QOhdw&#7witaHCYW
ziT;{ID2}skmPdHFh>T#moE3`wAgf?v<}!oeaHCtt_(&9E3dkTv!r(8av_`;n)$91M
zFo?9lI3xD;c|1sS*^t(~`ochL;B{pn`ndZ9l+)`iluA>%b>fDkfUN`lW=vez%0bpS
zfxIp(@)=;z!XLC<G*8Pl?reEliPtCAwjR#=*s~JpYT!(UDdcFP4?jYf-U>J#!3S4B
z^)P~%xUpa{a8eDumk*P@$N(|^I>h>Bgjexk<~&=lA}qW*R}cK@EQ`5@-lxW^L3cP8
zoDL;GSg5i0+h~<VFo5=&cr&PRbzWH{se+5BF?XW^>aD;6JhA|OF%#v7_z;xSk#Qby
zC<wevt%f+`0$&lE4uMHCP%tL7OJq9><ByQe5)Re_=UMkZH=_c;=9-dlqlchslxynW
zN|e!=V0KO2COY(xNrd#csmJ1Z=@Vv_Qyt<l9;0%fw()rsSrg=Y^67|;+4o29Cg>fX
zhy`i=16-bUH2b;iWw4EL6^U|xx9o4stsYd&$)E^bjvcOtR*wPQjkc5)=c;Y7&n{cn
ztsicPR#dZnYViA*wE|lE9YwOhIo7VB2vu}LHTZ$I3F<s%;lI-@u%k^pXkyWI*?K>G
zjsq?S&C>>8kpa6<iTI^tJlu?w*m#41<CdAUGM{ZHwhvKFB;3IPQrbi=XGSo(;q{PN
zVbWH$H^xrc!ZYaz0!lhDWAggI7!ATwqluj<cg)b3?7Ew(weD`Eh;bwzv41;Bqz~9x
zT|qF-kp4zNTfL6GR#;hN=DNVn?S!nnygdYX{3h@V6nL0dgaN{cCK82eU}5HiYZ54p
z$qO5+rmzAQ*9hoN15O26M}+vRIvzB7i{twA5lM5o7PTl|_B?p78IwwbsSpl4<0FCb
zXcEZhg`ZgX`?#u<RDqYXK_)>)BftT*+9)`rvB8EJ#0AfErX|oZvGOQcJ;Z0|RXocD
z)#lB6YLfQmMI?Yp>lmH{lB361Igb}&;7yal8nL#rpq5<}PKN^%&HAJ?Np(mz7zj(b
zhcLHwol{t^+vvQpT~dR})Fj3a8TNC4YPKukRs^X-g%9hPVKk9z;ST@-<}lU8?@=tg
zTrL8ASZ}qUlz3r=sD2ZbW+sJIHbpC0T{GYS-XfqfUJne&d$|i0mlvRkS>1<2-<&M9
z@Jasz+>FI}4V^ddJF~AUzshq04DwO?&*QjiZdtws+ZO$SJ0(og<}#~C#ygFHJER3}
zHDF&R1%Ir;YuYOi#=KTGGps~fLmFpB-FLTxXf-`UUd@ZGX5O=DeM~C2W$ZIG;^1IV
zA`ojp@cn=cTl7&o{?IAL_bU2NT_;MWvOh&^O646ROLc6;m2?lH`FZ+;_J>rvhoYi0
z8D}oNX<PSGO4<`1<XijyJq#{`X(5vdr86u-nT4gLpTGRF+jj!~*>-tN^+a&yj5D<I
zCaqJu>R02#cMl94N#&TCVXUZ)vx!1e72XX^vQQ}z(BN6^BL0Ov&rvwuY5oT|i<KVl
zN~$kGvj@>!oD%}4UOelECB{R$*D+S$b+yZo>J3q4jERx~!bp{AgbT;kQLWu(&p@kI
z-64WQOJ$LNxP-;tm;q4QiGkB-3G}V&oi$SH@k`k>oD!<l_#yRd(YrXqkv6kIO#C!B
zo=E~tG~Leoft>YmAtr+vG2uXLf11w}2nr_o8}RXzFqv2q<oSl{#};t6tkr7WJ(LJq
zP<=~g(0Gh5>~4G@lcl<y&orlcsf=wO+*I%MJ5(Z<Y@h(c1OfxIl88|=h}mk$sOl*{
zluAtc74`P+QsmsB<Z`KpY~Yni(*bM{IXBi0mW__0m;K<9;Vjb5{7Fa9%_>7hRKrrs
zio8Q;Hb@C{(Zf8r59e2zVtZyjZn~LET4)*^+auWXKueC_J0_@2<J7n8SL(7DHZ&l+
zXa4~7yY6@8DU#_G+F~ATO20s;3_E0FPcmINNWT~U8=ja98W@u??4ykDy>M3|mY-ck
z*;1C))Hd?|B3B@{yl{^y({suQ*z{I?g=}t5)vVm8Vw{7g^>Ob*IWhJ^h;n!umi(D8
z^E8kHD)9o7hcak^Ae4-e-y|d>^I14KwV<l+4O<t$UF((E_2xiHXHXsK+jf#Lm#tA0
z2}^WNiE{<6LB($w#l?Js-Kgdmx&!WOvjk3^esPrdjGNo8x>h_`2oIthqmO7Jv!bpd
zXxjVI=~dKH2`fV=!E1=p(=L1tk73`D6?KJu&32ts#fR!#V+8t<H95ja2~1&q&L#!^
zc4Xs)rN(e!;ZW_Y3=mj`jTlT&C4wYw2!KdKHK_Wy$^-<6u1?>58%<Oq$RXktH;>gb
zkkFW&@#ch8YBI8n&ygs0hS256M7y_^_dbc(1X+8YQFlvyeWAtopR^sleGrx3WPsk)
zzPl<(Na292N5|lxbY$Pr{tJVYaB<|qeEV9=z5c)irG>VP)GnBcGD$W&sgpj344tLl
z(R_M2r|x1RSingNv-cbPq4Os)*)yQLR$=3vX6$Djl3m$>VY^WV>>y6fNO08S>gTJx
z^>Z~bY`b(sPJ#5yRE~!bezhb9xD&C+BE})1lWuZxRX4Nnb~3qR{HKhhPFf!}`|Zjb
zN`s6FvXM=z<BB1zV&*?KVRM0+kJ=K|H!^m9oZ)Z#*_$wy`%0@Qo;Iw2Eu5@u5^VTl
zE!%3m@fYZY;~c&X>A$^cXEYy=_npk>%_fTtEWb|YT7E3BDZx|)a(w(xB~D@YUXXs(
z_OMM)r0tV1wpBxigr(s}rM8XDV{pdOW;I!D+r<9r?ySIje9pRn*DgWQy`M7nruMo3
zp7xe6+NL6`bBtvn&Y^%Owc_#l@gu%G%gu9R0l!Y`7oD9)lS#2RcP+uIsPvnUO7iew
zpzr88s*6~juaCiDY%i-M@mKo_4W1U{3Is{&VkIn5I-?aE414>Kc0C)eIvtF8!F8;$
z^z83GPa}H%gI;VNa~sb<*xC+vZF|+`6PX6F?5_wj&)n}0NsdDCeaeCbP8sSH@y=on
z;;0G*b5z^r8nZg-p+ebsUZwdnW+kXdwcXf|a$fvdE#x^Hy#cJ@VY*x&nRebcK3o{V
za1}WSnv$lW%pFl?qN}7BtL2RF`o^THZ=U?!;r@zH|5qLObzSIhb;y(?Z#-Bk{%-Dv
z4022yQun1M>j`UQe{(x${97E+$~i)fFVHkIk9p_9c4=zR1iTkwQ|OTae4>?X)^zcr
zN&fk#TH`Hzy8(~C<V<M2yTpSmmXbf;!loY|)rYBQ4D>KH-<}NqOjNGnY%-K2i`wCq
z)cR8oAwKqaQ^0FJihEoWz?O2aJAP4cuJu{M`0W=HR8bUtb-8=iAGcK(ceFY@|3Kk3
z%KHUUljR{_X-JDk>Ii-_w#<R&x9i^$-T;L{xLk5#%0}%F4>2xp&&e4=zkG=C`UeP!
z{2%^?FGoEJOo=`m^vSrL7p=4?@L$`DC$YV8c2;?A`f{AwrGL-bNcK2se=9?W9@2NQ
zThe~<fDb_LLcr;*j8tw_9PzBgMOLoJ<@o){uip#ND=U6nu;7`E*YD06QuAj?StjMz
z!T98kEq}&uM>K6!anMlecoh8rTmIvsr~d$}r<eZ#$-ezzZ+^vye6t&&$cR-RQRN<4
z<?~+X+W%;rDe%@$MIpnA9|WjE&n<$+--GjdYc*zsx!_i2!2*b+6R1doxCDX@s~pC`
zkBJnJ<|Cm<Gh*hY(sVqfgdvCQx|Es6H8R`ft5$Hhn`09%IEfU2y=7t0V%lBhRs9`1
zoDK2^QP@V+;yTO2z%3{x2ttP1O9$Ljr#ahQOB<-=Lr@Bp^YP$QC)G&<_u=^U3UYp^
zL;`}Y&WH|UF~?zwNujJmtdWF>{p)qYfty4+96^45#es2#2ZNC2Ws^8W+9}^vRuquV
z_9=mJKv(s^#RjI;+bQ4`Qd&0RLp6mN6WyNhDbW&`WS_>vU5ZdLL$nUL8>*_E*0zl%
ze!M$c1t`b6i-yzm-hZ1pN%WA`1LbcXoxZ~GRA+pnY_#M1(_p@xTd?uE4Uux38!mX~
zNA8r?UI}tnXG(?9`|5H&MLfe^WT$2n@(-Ys|H$KWZkN~-@eD7fGx*|<+7?=>tHqee
zI!wroJM5h-p5qER_fTsw`7IA0F;en|XKzm{O1RXXQXV<$GaY)U4FT7iS8kXZzKCKe
z4n9-cI3_@fvwT}xVk4B=biP;8T|K(z_fi)jahw+vp#J-b<PrG4M;_;_hZGGt36LuV
z)K{D45zxv+w0cCGJ9_CGmM=z2x?bjPnap>O?Cc(aD)iA^j=Rq$E{Li0io?cVOn!Iu
zUBx<AXa@Ly1eW?fCn@niJGF`bMH&}Hf3Fg(L9!RhPc=5JdH-(ws6DdK-nCj9m$}dd
zRx(ogmGOglk#&tRif`kG@|@$RAr+g?lxW6`cYh+shBpek4^w43WfJXSp&|Wy$dcO4
z9)WT+s^MOC3a_SvlD|Yr9ppor*rT>c_|?#6Si{oclK`lNAiZ$}&2e?Lm3ZrI)>i@1
z(v3q{e-f#r_Q5L<K~iX^S36{;_vPgI++cxAaE5d}!tbpsA2XBRGY5pfB9ymZfYf-y
z^Q%$O?HX_pBQxo1>wp8({@oz3Z=ax903aWD;ZBb-_Sk*<L!DV#^)xqkSmwzSHIdRJ
z&-vpDb(!F@zhl9|z5$&8JDGF|&&=epu~+Y<Ha!}J(|LvB3Ctx<C3uN|*$<{Wf`u8R
zT&FkLBLJ@+(_8K?S5?Z+cOh77ETipjv9!xK&BG&X<R+r!061C_l-<9>NrF}z4+2~{
z>>d)kwbf4XAxr<gno~<P*bd$&sLzqc<RGJb3;lCtfnJ*>3mL_@<i|yUs7$Xz3;Dxy
zFRkhVr>8sjt@Ip(XG_0eOsQ{bvLL^{#m`Vt#G7veiYOY9wL|_Pb<zt;p2VsI384_d
zmqEe9!%rl0F^3QT9KZ2ST)-RP^tQn{=#vr~DUgh)bgK&nvSrkh{{;%=N9UB~oAS}`
zZKGDIhjc;NdFlFf@fp%u7(G78U=bz>Qy@!s@%~;k@_083`{<_q=~?}#fIWS$R`M+Z
zp)DsUSh9Q*vj8Nj*nVMwwKUkz6y$nww1iN%;7I??cM0E;P%j!8*J-?|sJsDESu)ak
z$Ss%!OsDiuP~&%}sH|Onmn}HDFq>(J*dDEF6@Is9z@}9yqV%h@)x~+rgS)_V@pp5l
zV8zH;rMZ?RV|1~5j)<z%&ZXe4b!!LmiKgP|pg(WiRa2~jgJ*}4%we(f-Ke)r?YSa%
zybY$WXgy<v?Zi7epQqfbi{L)JlKyQPRMN*IGN}4#%c{F%<A+GL|LdRm+4C$mT{oG2
zd0?h&p0z#{U)nje?$G25Yq)Uj%&`;|ifrn;+Q)Xk%ch{yg{iLc0Aht5r{0+T^?dQr
z<;Mp}!7{TkY(~;&(9GR|5rcMkhrg-!&apt7zCG3WYkJj7d+e+1uM6XM`PgN+Uk2|x
zE`H4{d#@9nF=Yj$e?EByva838C>d5JfXcT^kPSCPBcG*P3{+_HAeaYiFfLws@fJ|V
zn%n$7AI*%ZXxZ6bpc+MOU#-{$1tGJR*@B>!<)pw@P-nxMaah`9pbbZYk&_InQIg5P
zaEz^M$fT3_sklR;dH`|<9>4ygiI@s>DjY}hm<0moWH=a@z{AKu{IoMxxQ{rZ%&H9b
z5VKWa7de4%kSDp&2Z!L?=S0B6;8%e*O4dt7*Z%3sEGAA2eyuj)R;v!IVg&G<R~Dw4
z3}Z2i@kf#m))^_Y)}VZ_rrini+~5UYJF1a_?JS<>p}f2JyllBETYRwX`R07u3lxsY
z#C>KLVnmM^uFvBU_HP}CsOiGj-|3Rgc8Zk0((7-0<m6tk`bAc6ID)1mT83A(zFkRV
zEJdeiGsbRK-#*)RYXUqblHeUI{<fgyJ&UlX@VoD&!>#O+WJQy<@#kUG(;M+lL1TW1
zC;vUZwG_n9nOOEd^a{Bgo@wQl-;rLX6B*pE;}WZMKQAJ(^kY!Y+feYWwI>t4y1f(S
zHe~^HW0wnA;TskJda?-xX&t#U)aAx_yp#6y$oS~!puop;y;d`eOnmB!V%K1dCZB6G
zvMXmP!~X9rrbms2<YQcjZLWb~vgK%bRf>HB@%M4PlV2c!v_&qxz34rD#bMz42~hO7
zneYAo{w77*)#y&@+DY?<d8yeP+J@>lE3ZEL;Db6~3qV=wZh64p=^O}reyNZ*!kb`D
zG3DZ>aC)dCJ3k)>xc5<%+sl>gv3}r8q!hkLwkAaH=)48Z9vu{0Ch?qO((My@9CsfD
zpU{2C&-;l-)gu85{Wl`SlX+j}Y-InP`4+8kmmHWXIsOj-3C`nXkUtaEd9oMic&?7T
z$*%PFZsuhtJgVc0;c|nsq>8klXk{GIPSlI`r-$$lYy^Bis|%75O&h)FHHjQ)yK^Jx
zMb3MZubJ<B&Xha<0d}R|3<r;@!HH!gdrslW*QCIwuU2U_KZ6ooLr3PSe_M)WdB=t0
z6HVke<3CYTNAWbudv^Qt`kP)Zf?J(=Wt8CIs%q!u(<LW!b@hL~Wbgj6F7M*{8cFI&
ze(og0<GqKNkYNQr`D@3K6SzB9^W4YVe-MS0Fl#Phk0i}1rf!muOX%i3t1a8opqBVj
z4ZBavxsP7Ba8CD{^!S#}Q$C?eiHfw|`tQ7)aJSH%U9KusDABC3-2j7otj&8=E8y1J
z{N3t>?iMKrp*j_bNn;|!V-jO5nJikQ3$zmol?N8YDYjNouf>X0rz1lO&07+$S@X4H
z|FlLj^SzY9ppXE^%~($8_P1f_1`BGdr$o|WkST6*#8Yc~*cd5%XWTL}48N%(7i~IZ
zU%DnnLt=iUGH$k2rC`LRSt>30GJfB-z#)sAY%08##OImu5)bozTFb1+R#H}!Aw7g;
z67|fEi&gh)D=YfsTeHPiG8=*C5>*&_qpj*5A!x+6xj@?e?9g1Y<0slBe!BaJLn=#z
z?Zkucx90+5A-P99QfyiuFukTku<eR?wIyP%d)*<6qv}0DVdni3fTc?j-m_^kXQyY=
zZI8d|r{{lA#7!y`m%mlO{{vL(nR&D@&x3k9T@$>sl|J)%6DeLrCa&0x-ogMHfdtJ5
zA?%-3e@gOYj(|@X1IncDJ~dpmi~7s2BbU-@BRs~<5*&!sgH*_*_a~5ZMIK!Utj+z`
z5r9svdRCF;Z)%(Ui|+|%EuZd<<PH0@oY_%ogXrMdUXz&0mP7DT*<`yfbyh_7s|d>2
zCySS87}9>W@FvZXa6P`|hVgAS&D6A=qmc*WmEFwE_1VB#?XWSf;phmi{;5aGZ}*=Z
z7ainViuw=r#Fhm*2WeP&a2T{XO(+=5oNfqO&$IbEE17WKXz+M2K=AiWr@-^Au}^$6
zId1qJBD-74dtukplQNi;-B0-xdt6h^vWB?4_hvd;V4hy-MCz;7ko#1rH&0|isNs_P
z^KwIe=);oRVt+Yn>pCBe(cZuE#(Dl1>~a>P@^s52LgiYy8Yxqmy{s3Jb<!JBMIl3K
z)lojJY3YW8m=e^pZekM{On;wb9clZ90=%$m9Be@D13|EABV~EsIB;^@R>r!IH+@Z^
zt&b0{J9<stNYU3=MPbMGs?FYLM>*hFnvNRTTP$}|<{|=EDTr*U#pDT9z@@?Cem6AW
zPrJLvO6R%F_!Ud&(uHy0luk>wqH4tx7!gwURfElg6Rg>0I4>*7c|&5TDZC;RNFbpH
zfsS+UVFE}IpX>|#d7$;EAjaj$X1h?+cFBN5Y_R}(V_GTtYrLv}UDd-nV4^WX36qzS
zb!2&90ld!OK5)DAO=J|$Yti*up_`LKqNefI+|nvJZ78r&($hAV<a!mQ6T7c>7hVk-
zCkTLk*U^4NMIbGFeTyj*Z7`)qipH-rN(4P@R)kH}18mtY>@Bq{mPI6Lo`bG0TFZEM
z-ja+OeEp0u7w!0=@t}lORQz`b+RwK1cbM!bXND9@JUz6}65{>EJ!J|tS(z8EG&t@(
z_(1mN;R&6!K@fG|{vv0Y+{=d`-fb4iSUJshEqUmZp*xRCr*6DH{5ILv;zRpDMX%=&
zo7F^fOx^8wTh9}1g*W9wyS?O8=$VBG1s>S9)`LU=21OMxI)q@b$I*3Sr#ymgk)@?b
z<QK*@=MM<hem@MpJB)G8z=fR}IDJ|pZ}xr0<+qxpGkCGFKWj;B`hs=YVxyTDsg5iQ
zp2PkI!%Vw3pV5A^v=w}Pid9Fo9JD<TA5r9p3tVEciYaq`2WR-n=XOG&lm$5`!RsvU
zAsDDEF5)@L{XU2PE&GFfEt{jdLO8~A3k7JGZCGT0<UUIOzFAXbLby1A-L{UfZNVy#
z6Xv(3KEF~gtF3denUJTQoPL+SAsVI*s4DnFv(UDY^68_(@NABi?1OE|&!|_kDfz3O
z_r59oZklwAYS+0WVR&*ZcJ2=Zi<GIkQ{*&CsANW0x5-u?7eh>y&do18L03m3y^2Td
zD#t^|`kQ<r+1m<dW0Hb}Teg)X&-8p@`vQe5&j~Zre}G4_GAcPLJvr*4kNJmpo&$1%
z50Sh@gYUyKOF}sL9N3#ap=s2tRj+8j#d2UeG>vAUTx;n=kLa_A`c*Po7M(G!PXz96
z34A)M@P`9lTkSeMVfz-Fx2g7(MFf}fSdN`JT=ml7);Gd(H+Z|>g}DZW<k$*aIda-Q
zcsYO{{hRMr^KlEWwsC@+l;M%n@5f9jGCQ6Paje`V<ERe22(*YdEHu_GXTwx~ucWP)
zDRgBSj>NQomKVu028f|u=QgPrEtQ<5mI&ip!r8)r-?_Us69G>7wwrvIyR{3dj^|;0
z4PWIba;H^Q8?a~14R-Sc@!T!#US!+Qgb<CU{QfB}{qGt-L<`xD|8%c~RZU;#z79^N
zhuk?wcI$cYu#B#}564oXV?mC8`%2{c>$@vn;Fa*7m9oJ`O_l*VZ;`GW{w8j=7DG;V
zHCgHK&L{M?0(wIZ%4uz5JDtlfOzzL?DovCGPXm4*S&I+4@K>q9%W(J7e3|-IZb~Z4
ze0q2#RUB4+3+=ZSEB)@VAKF|<buadh-Teg>+nYD}<hf|0NKXn6SjX#4J6={=#L8J6
zzruuEw$KrJ**t7W{O$@vO#SDOH#&T$uR`us-ShZnd~f6y_@NW&GG7#|7tMb#c7bi_
zlti~b5qupx)ruMOzL3M&g>6C_^Y4G)-i)E{w(sG-(m*v01=1IUmU0SV>&)JiO(cGw
znSb3RCE(8Ip(-AEnV=<IqVNe3%-Qu4m;B%UfyU9<eXGUmtGPj-z@LmSo+D&abbnL#
z?F)ldKI@bD&%FHk`|aBx4$9%E%~<xblYRS&m2se_Ozec_mKBk>YfqbAB*QB9m!pK^
zRO@d(Fk9W2;qMWRBp{~hYQ0h9aiF|Sael-`P-%u#XB+D85;j&0Wgz61$ouosd*KDs
zN4XnDlB*GJyV?H$`}H!ik56RC;?I+n-BgHOV|xK@SNA;w%hZ6WmbRVzfnUdi{|4H=
zJw0~UDAANvgiHm|?Hm!p&h29nXb8SO;B9uP;L+Ks^4MO12vvUH@gUPkz#3VW70Hui
z#&6TBUi;M*IsU8;@-g$fs0o0@S_iKjHf+`Mq4jNQU|V#cU3KzzWlB+IT4{*TsXNWC
zgQN?8+6x<F>ZnC=d!lc{y}NMhyZKzXq};Q*V-M|ow?a;ej7OQlV~`{iMqx-Em7_3p
zL;C#$JpP#EqXbW0$2>!dF_qXl)*C4zs?}!=`5+2VSLAe|#qkO3OKlm*PHz?N$5nIT
z2*x2d$nzlFNEJ)u6bKjeD7<e$LDVV_eh&M#j<H>sjjv3rsAZABgkxzi{~6wp)Pp7C
zZGP3}<G1*eWrmC=V7WtvcO^1x8O<zaR_M`f;m=B)q_85GMk~(p(PD%HDuNq4AdhfF
zuza?8n}-g^^W!~S2?QZ!fxXBDKl=9xd7(J+I0dr-9~+ZMbvWHNMl84S)^(5H7c0DG
ztv<6d$1G(az@&=t@|iLcG45Kg9Y+pgUJH%Zr_I`|Btn^TtpW+kCd##*N6bbK(5`l>
zsUoi>&8|)Gb{ge7-uw%7DbjtHkqg)xzXz<iWD~#Nj_&;jkZ$A6@8T?ZwtWHTGrB!)
zb#CS1eA8am&+N@Q<ONgw{%k5<^>0i*460)EyDCz^CRI(K?7@!nve7q1p|-bUsvr4^
zhfU6d_tgIZN|7Zo@8jjrJEb6D*FEi!exb-P?e*8olP}`g{27Bh`2Q?k>AVb4`YHpy
z9$JW*v+$>;|NCG9Qy%iXsHkeGUz%egb?uJ{CC?mKoTkF1@n%ChZs1LQ?ewP%YXgc4
z*sVrf&o|V{gi1JD4n$ywf*kiIVebwtjC5Pq@(J7S6!d<<ID&WQ@#^^rkn9)zw*;c?
zV@oIy<HVO{E1f-ESPG2%p>$OT5%w8=a#8ZM#yi$kon>-9t5UVtctB~^_~wx$>b3YF
zH+^N#Ws<(_wdZvBDC(Qo!hk;0AfUJMep$M0>7%eL!?Zb-*m=-~cFp2ZJ)6_yltp7R
ziyckwulb@4)Q3R&2#<HZq;_6T`$>6r%lNS4Yx#C@`VXMBH>hp*^aHu19#00#FOuR4
z){|O1)Ac51x*%#KRDPHfLfOMkE0}MhdJt@EDw-+nLDBK_ZwdYAJJJCyo7Vh5ZlAfQ
zx8yMwr-@C5UO{KKuRtAYN#M#uGZI56*cr<+c+5EI=&ICCcgY+(j2oSk2j7oYoVWa_
zkwK$qHXe8iIN7xsjP~p-Rl`0Mr634O=^TWPtOGdn#T=N~<x_o2{#!E@NDA9{y<z9x
z{9n$U<28@1c5^c)E|2a5ZoxnPM&@29nXZ-PiKL4v>sN~88%%!>Rdp=>$mkCpk%zb~
zz7{HTUO&A!^rI|~=Ye4J`(?`D2m8c@?tP7T*%W;;Uyl`U%EW!x2gUSfa*cTkWX5y{
z!&F{H6f!0yIeHsyzSAcOx@#G$^%9aN_U<>&Q&k1WyLR}YZnx+O3^Kl%+|z0`apvsm
z@fqrNDQQxDzFSQ!F;pgUU;FF?!;>#d>{Y;>`cFGcgSwWI%Fp~0uYQv-ri>@rdK3Ub
z=K}YFcGzeHTg|wZmEPt2Eujl*rkz{~)3d(mUv|p``t#>^K3RaYf18}uIXEB{%YVF$
z0T;4=nG@Gi)qxW{?z_1ARaO22sE3MsLRD7_<TgLEhpJ9#zh&8rm5z7!*>N1vFs3`N
zZ@2#kK%gWgx9sJjk7i=3F~Its?2{WhnHNLe;w{QnA?AKLOPc)eS+?xXj1-+#sI+W5
z{+wihQr1jD(wMa$EHq?cft!bnItrOT>eDg8s~Sa@PdClQp#5Be8B~b(Ize&F$*H?e
z^riPJq`q%^|KRLFR#dKLFi`x_8E#2f7$TfWPdOT+6d`LwM)A*)86|%r=dno*=!W2l
zK&YVx7R|~Wq{Cr1ZkDy=7nX2@?l|Fp<i!w@rqsDva<W{%G|ZitODd&u?vCzpppcC!
zd%r-cAPr-`SD4uJ(#zWl2GP{i?z%i(D2#m0U3Bk#(q<l7;W9oa@NOiA&I$YSoXQ2O
zB~Qb*;8hlkrwKC^g6NLRB1T9X9lz`+ZDxJ$wYJ|49~j}a9Nd~<xvkwf66+CvH|%Rp
z6}3KFJR|Lh*pq(%wqFaO|J4VGQipu1Nfv=|#eZAbxv*PW^+!zrr+M+ysyan~+pvB!
z<IvtQqh}Dzg8u!_LKT6XH8lcdn`{;ZleG3Sc_;4o(S**|6*R9;&g<#IMBh6|0R_8F
zD6~K2`{}<aaN*SC`v-V%#J7N&eBSClB2I{VDB%0E3Y{`x^egn5rS5g~I+=~iQCl+q
z_^5aJljiuGQ@(A<Tt+*Zw`}nWiEcL|-ncE&^)<Mo%sRFT-j*6N@w>7+KMPRE_Gjy3
z;&k1mpv}l}-CXOBu_=YpRqcvYwV$FVb#D~VSO?Z(4qlr?H<l*1bKa$mqgrjEh^Wfb
zUx4M=sX&p8zMYhxI*l61>5t5dmi!zT(J^*fnYfz^OrZ>TFP6ms&1){v!FlKlXJz~5
z^u71|I3_sxY3<bQ^7lgT%N-4_Ch16(AniO-Pqx?sej{hBu_pheotOpQU7_c5W}`!P
zgqG>~t3{_~jsbs>XMCA&0oOJwY@jdqICBuQ@m~KUH|fQT9x25mYt?#x=^hDapm=SL
z3>(}S^t3zQfwMc_o-<t8&5H8@4$?T}Q;QL^0D#xJ@TBb;8x%2D*T?NSdAxvz?FPoh
zy(Lg&$i(uDfxbhiE|<S8@EL+lVMs_#m6Jm4Ry_Q@x`UxdMWta~@xG<cuUb~y<nX$t
zLvKhCo`t&go;0>L(V}3u(Sli;a~*eJw%#T24dp?vyIRZNSuUr%Tye|nz((OUnSh(S
zHqy<mO0}=vOtmo^;u45S@2WLjukg@=JR{hJZP#SY+i%WsRpr8k@b)!uvmr5y%ETcN
z4A-oiDWiw!rv!u&!Q9v|j(B22ALtJPrg9i5g<iLsuSZ62ROb@pJCs$#D@HKsi>PqT
zxu(&210x9+3@5Tb1H*xt1tzBjI{=@yIo+hR!|}m&&AD#%Jy6rYs|Nm!<>~d@reR|E
zPLWf!<#r}W;+d40LXO{4kMEQ%+TCW%{&m)9I~SQB+)O;Y_0ZRpS9mz6D)CorvT4MZ
z&fBJ#;SlyapNxOMK0DPu8y{@c)J^F~??3bU+ttVr%Tf7isP&=tGtkJFp%7^jwo!ys
z2)A6Gwr?tYG`#e!@4H|ce@yQqhl4vG^{n-FPlR>wJEzL|e(PPL_gb7?IKMKCzndh@
zg%8gu-uVb&A?m!M^5fRKH4c@-^#{jZS)(KAXmnDj*iFDm0=9pU4KbjE-GLG2&laRY
z=e*`&E`0wkK|8$nCBgM|Q@T6%JTCfum|NKnCY+90FL)olabX|)*=$3@kujE|Y|9D0
zSHV};--$k@5S+qIezksV5VT?KxVUMP35&h_P^b7=?RQ+W7A-vU<ktejuN1}KI$@1p
z-`fg)<N$9=bfN2mS*3KftD2{4-Yww9-pos+6P5d1di&ukeDw8!va}Vrmz$SeLv9*>
zx`W!g%h_K;1T>W*<Nth%RywfVxT#o>wj!1MhO%0}hj%ufU+KbjHLo(GHWZu<(~jUW
z8S4{mZn!uDS@iQWuZ5`um;BTWz3zBME-LdER6Gupb{PreX}&P`@X<<hD~XO~j&d-m
zRv?iNg>rG!E{~5;rQVP9ekZ!8fx6L?j&0L_ULp2oISD;h!k%8z@laH|tkLD!Lt$|_
zF$<;_9W=)0LYtW<2acDo(xbftWoM0DRGz56o@CbJZ9e#Xbm@;!<gIu2G~iX}x8KTc
z`CO{E_B(Ljcp$887Gck@IxXuR4k)Bxy2$9gytJziNttJ(K8L-N`Jt+_1JtUI2wxG+
zV!^Md{e8E?Y4?6YHqbK-yXm>J{qubK1pI=Xw3KO`CL_S9$8Zu`7MbMST4=df;UOxh
z8}$dt)jr`T@<q>GK5(%{_3By_5-o2C_l;_)v+9iEJ(e1>sn`-To^WK44A{I(+;TfR
zw9nM#L?<!=tbpvIQ?W_GSNq!c%zP_q`93z7&t{nZeDJqR*?4u{iy^ZIojHPY$t$~6
zlI3I#(D3H|qf|EDEW7<V?6=kU^@K{p_Mc5cdGA7cTj8@;V&9mxTK)kRI0>s(|J_T}
zcu{t8X}`7j9lYe8DPH4Fb8W|-u`D!~uB3YFL}Uu;Bayy#TFUBC?83w^To<SUHBCDw
zF|twq?x^)D1Uk&u0i964TxkMFyk)g|jjxf6rZZd4d;E|=eUwEjR@A}0b5foyYbv&g
z)AE4vhnekWGQ|<Wl;@)T=R>^<hv3NQmFvE<Wxe}LjL^%$`-M}An}4Z0LVkXvuhop7
zUg)yRD^$}NiodWwcsXnnZK<Q_o_7xyNVvy~j2nmWo5$Qkz8SDcADJi#*`12TIi~0n
z;@*KIFBsve<2zcsmkyTbtsa945&jGA36Ijjcc=JQw{7*e0tC4Lp;vptq1z2r-v#tW
zh&V4if?TtKrf4WXI5(d3`?T@KGW$YzcI)GAZ>fh~s`nLxXBa~$-fcijm0J_XLj~;D
zPE18hIKHZfY%I6k+Hl{IjnPXHZ9Ww^a}K}S>F0+0fQX_?gQ9g_TEBvu?}`CR)=Hn3
zu;>+Bi&W(2pUItWZ-obf4vx{$*J_C9Y<%eH@pi+Vxi2z!q}nP*hJANr8LjZ2ordJ3
zrFwzaT8ypljV{tiFt9fNVm(IQs_5N@<*~K;TXs#cAkDW&zapa?BvOF9AM6+yHxo?x
zJ2iv%GjUpNAYnI=RW*24MFfKtrhjbdK)9<Apk(}M|1V88{Gu5`524n2qu`QX+96x>
zs42@s{AJi$WKQ@<@1w!`j~$~qG8?}#xh`YVUAzp-XYogxQj_sDGzsN#n^8Y?(qYsZ
zsqc+89t!TQ%9}LP)Q|#@w)tJ*pJnzJPwKf%e~3{%XrV~_#nUDdrtoR<W9AAQ$BjN0
zH`B|I>$2U3bh11a`t8Ud^7KD1g$v?lM^Gn%fjRz%VoX()wfhHFW4i~I+7JG=*|p6S
zn|;lfn^Xio+~__ZdEMFY^0C!}vmbwmx$$#Y@ih&8hLI9H4HZjW>B!akD?RxF@{A{j
zD-m(g#61&_J!$7{doS7bv1$*@%)IRK+*bU(@?!<!kDqn?%q-3uKmFwfAl6;Gdd`vb
z!Su@tGxeN~YjY8KZ|40zD@%O4dD}$X(TS{K)Bg;4hv1-+Xf(3|?+CP;@5Hg>!rj>K
zh~QVywdAijhV1gZ+LO$U12pIHP~F{jGmM<fBVchL(!a94b{%V9PL}KtGKHMov^BhT
zXKOQoy>%0koD_00rB-kdD@P*qR<>a(brBCHE0?Vce?-uZQ+zaJLMOtE95}KOH!ZO*
z<Z<un!#g!wdf~;8avJFDPCS!Luc327Y9hfh74=H3HusSD%ruv}yhEApDG!BLZF=Uu
z(Vb9kd~?y&8r+=75NNTAVds&AV`9Sb)<;$ZMzVBcs9L(A6#-@{Qi3T8!CTu_VpywR
zTwkJrE5Y_D$u<Pbe4ZcDcGVggLu9DaKgQQ{(9oOgu3m1l9{&LE+>6dR<VP@TkreTl
zFl9rlUWpHtoMOu>CEm4yJD9nS2zoDxz@-Fi?G=mJ)V;VXLT2cFBPh@yAf)q?&Wc*2
za0txAq(=8)){Rh_-&HmL0M2cpTqaBHl+(XOd#z_1)-~=uFAHMwTQyE#ugWrsdDPBe
zP@U7#HRWIICiGtSpu6jj<=V%t2w@Mwx`ZI@l<p61A6V^#6=pn8lcmvUc1XtT;>3uR
zi{P&9?1!6;Q+yA&wq+Yotfltif)=^C_Eei4rYUW*Rw90=y_)^_KQcJX+?2t#Z(Du_
z+3g#1T8btQMo^WX1Um{~vqPE|Yn%)q&_4j#G{tU~wC@8=Gj^?2Vp94S^S22-DR*)Y
z%6bn<qk?V3x~t6K_HUeY{oeVY<AwM$te7$l2tL}mKl4~ek1qBqoWu>duL7brNS8#O
z(P7;lrE61pq~M{K>CN_|*^%D{*`E^pQ%w-!TPR*X)4_TwQ;#eekKbr{J?*#+ty<Z+
zs0eISne?7dn%k|WrLAaUg=Qq<>C_|p;eO#lLhG@?8?t1-{>;~!8`m=-jc`Uq_2O?8
zP7Kip#SU7-U&hPu)dtx-dp0z2W(*%J!(=~<i<Y8-$GtVZ(MQ>4mo5dz8p}OOq1GGB
zs=sSmn00D!&gxH@3DUw7cqdL%;W$3eC#tIFUy<*9EFPNP>g(U+$n+n%0x4QgJSuB9
zV4S@(wIofedEF&HCFr^x<D5kQ0qkEn)|_jZ?7(R~5EfrQ_o}W=DG@azf9MD{YU@rt
z9rFE;)-AL5+RtLjOZf=75r9h>wp%uNtyB09aQ}5GfKxMB?Q*+Cjgpb9gI^;~?{C|~
zhpVi&KG*WM5DXqVzL^W)&>BJ3YQ88$p2aAfZCrR&ouJ}7H#h$#>U=S$WpB{P6ny<|
z7xR3jwZR1crZJmVcXZ0iZU3tNWgDD1Fz0ZSBBMs<@9VjaHqjma8Ug2*--6NcA5RFL
zX6VNv&x}>7ce%o*#bVwy8W}#adc-qZ_l7ZuEsR>=PoNFd*YoMYuxTH^Qij4Qrw0_e
z$M>f%h+2J<(eC-A0Y!&ssTA@^;-{DPYxN=e5w5YR*)y=($0|AIJ8@)JG^b;AN-_`3
zYE*esw3BwOkdH1LzSYtgv@(YAk6zW?=<s@(&6)HGA*#mQM|OQ9*|%>E^nDHhRNmf}
zDR~pjD@CK}t@;n3-8QFh{^3i628|W*3<v$Ji*E=jl>O}I=zz|>xUM&3cX>h@=1hlm
zCAoI@ZOqBj?f$#ZNq1j$Aj4|gNIWqj%5)=eb((nP6G(cx_>A<S{p|aQ;LdR^8Wkb=
zO1CcTIA(3*^z3?j8ZY9S86&*~@8{37n#j!Oy7*c*Cu?ve^aW-~tFpE>L%tX$Th5PI
z#%A?h)%C1geHhznN5`7AZA&XWxIm62u(<8oV?iPgPayVEX{#M-`*x@XJ!j+JU2LzR
zBRxF6=ae0XSuMf?TTKiewLdBT0p{&I$1Uo^>>UTlw(pR=*xg-3VU5xxJrpN`HQ!(X
zto5ycaTYP=i{0JKMvqm?ET=K8F1zPX6#25Ongqjt|8;D<o5V4EJa$9aMcqiok7=6f
zcYKyk7M~rrYj5|eY`~ao5yQ`f{~Rg!E?j>+|2w;lM-;(6fd<;Y_U}A0Z#U6i%OsWI
zKphbY(=vZb=L>lg<EH<P$3<xWp?Qs5>6p3k;U9n~el4k7uxbbfj@9Pb=$70S)T@xY
z<}W&O!aYelx~%c7vaVqRAp(iu#+6xUh|*PkeR|Ue7gIiI#pc4*+`W-{!QfwH_Eqhj
z&ul06g=w}wrW1Z8C|r`NWfHrDQy4Vd6}Ik5Y`Yz<#3Yd<t%_8(lP3gHELk>nhVxT{
zEf=lBxun7P!J48Ee;(N=zq$EGTEgydja-KM(}$`DrIwNTLOgE^vY;m^o`Ii~(0@50
zDOJ&yWmcrMF7AXaahDD0;ERQVgZ+_777v&IvL8~ibBm_iO@H#-(6Mo5iv8&nG<{sU
z)F>*^p|&UWS$~xzBo}tmX7RDzKLAg~&*Y8($DEqO(In4IgVJ~Dh`~^*HAZHxz|wps
zk{XS$Crtti<@~CWMA|FSj`?p-TOYFHFU%QZ_!dE6#eRvg>}=A3s}U0H_Yl;Tuyzp}
zQdi=!ry<Z<c=LH!H7}kx$%n0q+xUT<i%OW8LeKRCLcY5DCsEAo2oW7OZ7(<<<>P$k
z?KS6Xscd7IJfz<Ensbj0-9>xg;PIA0$US;?4@e%5-Ip#b7f$WK8LPp5r_Fex8_S1M
z7`ivGW$M8nH78<w*hLUQ2ijuk(ry%HTLRPVi}h31OQ<1uvi@2-ysOzDs!?jlX`1uC
zfAOmb4l$b-jrzTa6BVeu07z5=0Nzrj^W_Ow-VzJss8}zB>N7!@5zR~<c35Qbv~wc}
z=PK+D1;Z4s{p^P>FT;UnVyH9$>4#sCRb4jD=^m!tIR*qn=eFj0!;0OHb)AthhcX)6
zjuA**L@_}v7$XZaO1rczS}$f@ulYD+AfDBPZx6=2v9A(&PHV_1RE@6%R153kgkW<#
z01IM%oni!zIz_^&8%=U0#RWpixz5;}n<la?7#zltY3)bOk*-}RJB9ZRy~om(Mm-gR
z26r)GWkER4G>U^#8Klh<unF{G(14JbZtAXOXO?2=ycRyo=oy?TBcEw*jlHtlGZe10
z32Hr@(#|iFN(~qt)l#d!v&eXJhfCn)KL9|6{q9mZRRKtxrmpMbCCeTnm=|j*-uT9}
zi!p6APj&P%@(-w%%b=NkSK}&s%W6&IukH5(DxX2)qoFCK7Jsd02WRDbu!d5*sXxVB
za^|<MCBX|HW!ZX5ZSzjU+AHx&SKV{fjorCdgFe+Wx8A*^=XlvzU{hL7P3hiJY9mSC
zlX^}~_74z;L2=(xoN1|e@Rm3JSNzqGOodb7UDGyWuTU8U;pZmDAl2cp9l-)6IGd*G
ztzuS>Z9B@NJ#~`Jo+AJFoC3{vnXB>WJ}nS-z)@u(Ei1s?U*~)7A6i^M%brCC+xQ3F
zo02)|-Ya3?gQ=RWEI+N5kuQHaDN<#6udO9>Ny=yw8;cXqoyXHtw8~7a&XBivr(EB!
zKjcsS#DASNPYr>Tg4wD5^Lo7XIkvT`(~bAi)`f*#*v{kl|CXNJ={#_|(|*#iSmQDB
z4^a1v&Nh4T@L>0JNchh(x%S~9?WuO_`GiSZ%)SEO&ox_g{G!}MS-uHnM)Alm(k#T#
z=4fnX<>%Nn?Ozh_3V5Qa?QB_wUz|H}XyscGYimE`opFaL$p#2s_!xirT%j`)6Y3nw
z0{*76RpGC9r87N;RoecsqWGf@GW{xcpx#-hXhlGtX8#{xu{Q73Hy68n!bivV_JLAs
ziMfAtaC-F#Kcj3t^Sv%l$s0V4IQz8>T%!Ua;g9vx91ljVfgbzLcg?vzf9je_o3t16
z-Wi?={62l6tdG##eVFBa9wB;k-9qe%+!86%1X9KEr|RlhE!3JY+-kO_^P+Wpq9*va
zZs1-<(Yfd|dKD{KTD6gm>%XMC=l=i<LG!+7{?BRC_;L9R*FOfvx@fV{v7d5B{{UXM
z@8=NNqJz&1jC*9&QdYYrtG$txIXmXS+}>KmJ@Rw>%~evPQDg%tyI}X@{Bu{AA1s7?
z+qW_NIQ==QyH+-725^~d`kYZ9iK`5lwa7hNulfG~dZ@`eJhI4sRNOY7Z%@*ti*sqg
zM{~jS8%Amsk1R9Fr_2Yh`OQ_^Ng3MAo-zw#I3tzF>Hc$8%a1UCJK+BSpUSJeeqQFs
zIl%P>pN3fa4u2u*`qd#aPm&Ze0fHABvCwrqdXrFxSjxWF+(r~&<a+%*%|`N*At#kl
z!0t(>Mv*pVnL$<i#Pi3Xt5PAN$|08mINayvJ5&ID4o@W2xNPLNc591*$D+i)Fi&=4
z$u(TwNf!s@kmKe4=6!$K_U-kirOceFb?SH=dy;y7HKLK5MX4?A5)a;P+IeK$06*^@
zrl{OrX{d`Il@LI3sn9X?+B*JJ+lzxE6jEhua!<;D{d%9n6;Qm<lG0wP1JuTwPtXC+
z=4twZK{f5fl5DiHkryB9HW7jN;<IN<`D9`tcOySEj(~ny{&i+~qJ>FzX;}0Q!GAnf
zWb>`y^2_;4*uXb&@3>KN(8-O`#Ko3&UhJ{_x%31Ml`V|YHL-1gD)E&Z{VA~9TBCs!
z(+L-*@09-l<F%<Al1cK+Qbim9<0tSVniYy{)$r_Yu{%CKW6)KNLkfR{t94Q;eKItT
zG3DeQ>c{%jT9mvp?Ew4stB%JlS9fy^%3^;q7UX{jt$Q6&ZsRe^yI6N4IR60aS3h+t
zbPh*7xT=C_A(duEB0zJRSGLA?I{T|w?iHRRwsHwyn}^45f1&2N$=FC9Foqf9Zc(})
zr=?tjQCXlRl)C-rjZO&1b@v^rJxKkcE$3^>sg;#S`qcG5?FznYpxCJd<gUgeJqCIo
zV@l)#Km>Y&+Nk-NInO=Mtxx9psgL<+2OsSZPpwR3QX3<4Rhtad*PE_sBYd?}MKoHE
zfm0!>>e#74s35eamvA*y%d}#b%)-?Sw6EHoyCSK1+|-LEYdJd**)tl6n+G(k!x^g5
zNHA%}Wkj4wNMx#Beml|Ud)A6Zwk??ltvMvB23l2==9`#U)DnzTGRO$3hFnx;Vrw?S
z!b)<~%+ne2)P<^BGm-{8)77}B*nHFjgI;<(;$YP%HC!5yHxz=v?bfSEYG&rEN^wF!
zLg$)jYHPPO9{y?Aq>G4hMOX)lr4w;h194Rt*wQPgNNNL8zG?<Fp>60Ynvi?b5LBRX
zP|U?qPXdq%QPzMg$5W1#UOJdJy=FsT!lNRjV^I@KR}@r$)FG+rYCx28NWy?=tkp<p
zV^RY_G?dyCCsUf3bgHO%s{rvtMun%U<n*Q~!L2<#!$_*o2L#seSJ2TRwpKICFYeU+
z4<fsLGS#AmM5q<Ls!cM|B}F0)m8*cG01EAeK2&WKK_DP;ip{!W0mlc{v}A5iTBA5q
zoYYL{iOk<DnIHi9{t$VrY3$xrW?5P(2V9IF$JV-g+f|SftIw@rTw63paee{*F5if(
zS~?T#Wk&_#5w>{4ciM8UKAoy%x!4*r7zKKxkGg$EJ9~<_VU5>!1-lRie@?W`j7Ud`
z9CY2*oyU8PhIt)E03$pr4CClWU*}fQno+lQ)*Xqe%VHUu%Kh7SQb_#A^%Q1U=UC@a
zpW$^pgYW7+>4>%t8+$62Uf|WB0bQYp$jGX4A#`aMg1nRJr@!Y`ZCJgyW?!BxsUF|f
zkOHf#ql|`Ajmmi+l}@OWFb{=&*xon?>N|Z!F;x}UkKH~-b5pbhW&wFt8B^;(3^t7z
zbZqho!5s}tY~%$DzcP#!>$qo;{uMkdSxi#oE^w!zT1K~TGbDWKHjqA?F!k+8uBa2}
z#`#-f+O4=_gPx<Ppb(1~$-zBGPX7Sbr%kfLxEz(>spxs*^vL{aBPh`Dy}`iknoSKd
z2T-udfOC_^Ju0lL%Q!qPPCrlatv@Uk+n&7R^Zu2cJYXv-G5LWR_dn!Tb4A6Ow=;Qd
zh2v=J_!Cw%*ob)){lNMB2pInWKGjVS&od(rodFp3^!zJI1p?HL?o?!Y0LG@8ySa0A
zRxj?%2PcP@anGj|>wpSKI5-*hI6Xh{+M)K4NFd}nLG}LtIjsl(097;OoUqTf1#F$n
z8nMbq%{SXY8B)EEU-QK}Fv3OwiDg3n0FVGwHk;U7C<GQzgCpLoS&=o(%$fO45j`+D
zKl%-G*y(P>T3jzB^FGpkY>f8f{{XLAy1;2!RY^GA{(m7+=&@|HS=*ku&#5>g^35rf
zo5<D3BxfT%PfzEX(JMpL&Q9~cVUm1`$PNmRPi%jmtyPXds8sc1pRHJti$!o$?qy{I
z-{r}y%U$xwq!7wDB=>CK56u3RocdU}^)?qF5HFz^9{3pjD$Gy?W&j<i2TtVn{{TLw
zqlPWquR)G|g<DoTaJ*-bf30aBGdU5-7%#ggXgL9UpIV0C26x&HV;J=UuCFAM%7Os*
zIq6YKnUiB-x%q${%~3Zg8-0)}L<16V0q^hp>Qc#Sk)dtU;5g%f=*RjG#;<u%!yla{
z)-Cr`_xJjC_o`19S!CSBn}N{C=c&(NI%cgaYt)O}s9TRVP2xQ8RAuBoi~clypp*r`
z$MD<6c>GUu??lYfJFz2Z0020`gVWemSzWMIMgbd`^uQFZI}eiOix~0&?iZ)$RBo=t
zqkN<^%rTzGJx};k<u4RIX5X0o)$RD7=lND-w(eyjSNV_3?0Cnp^ri_AJ;)>(0dMZ1
z>V3^IB)YiDENdEeW;oB#cBlD)7#O44y_tqtsf^PfnbT<OPu@RYN}|!FWFm-wUvqa*
zD`{d^RgeyzhPhZK3fbF$d;0!0zo*=@kfnByN}Je;CUmw(<jT`--6YCP4m&VBkMSPW
zJ%VL&PBZC7p=zE}H+*qg!8;t)PG0)h#MulAk(DE%9@UuUugd;gyi(<%CnOF^pYIQU
z=UKNP#U|W>2+FYN!?6`D^)R^9nc<K+9zp!-+&6!{C=bd`<n}d~&q?NUQ_NM%QjM}4
za1SD)X>dByd7GD>$E{bH>8WZsi6s93Hc2@(Bob8?Vogm85!S8F<+f@oRH6P}wMkzh
zpTCMpkjK2)f29U)D!J>LYqkYiVA*6#?^DGtd)8QXCapk7BC@E=*`Upps&X;rtY<4#
z<Vwj%xH6QE!T6^ulir;qd8k())$_BV6M{`STWO?0>qGU;9OWV9vr#isOJbN?H2Hy!
z-OVXD6rJf9@rqY5k!6KP-kwb}_^Xa50z<&!f_b1(ift6ojCiV&w>4%?l}ba_nnYNJ
zLsi3Nsq(c(qZO)&q!r6Z_!T-nYLc}|uZpfyXB`VgsxmpI2hBWlQqhwfwK~)TgHtKu
zrHaI&lnk0`oYaV<)RM^*U{mu*1_dWMtjGpxVd9<Mn#QRh8J=oYH4l1lp`fu~9x07T
z4-_+Yp&_I$DBD8xr>M<S8$v4J)S&TABZ@$9YC#jl_rW7I*Jv?9#2=Bph~~K%n{xr4
zYpBwPlE7qc@9phflo_O6r)Z3*fD8(2N<C_<avh)JQM`t_B{QlqC6Y{<s_oX1rZ~k`
zl5bKg7b8@N=Vlql^Q_6F`HtmW)tifez4KK8^G4=is(No9<eG-fVpfrts8k2GFf&Z^
z3miNH*@q;3eX7|=iP^9g>`wsx9cfYvwK-6MJ2Y4YPxGcSDC3f0%u)s&(R28ZYMJcC
z#APN$+tfbw1%3MTq-$wJq1j1R<g1_LPkOTrq<hPBkv{e~C)|$z09v9*<Fc9}k|QZ>
zVZmkjz53I{E}>>vTmi$V=V|;pkN&k-R0;B98R!oq>FrVZIk@v9A!Gv*MtI2e6b7SA
zOsqs)dvw9;`qc4&>SJZY2Fj7f4ND&7KvjuV{uksCM2iX9Lb8H!xej*c)Ou3TE;D8(
zRmmH_fP;+vdsNX#>&U_9Di2Tp08!~omNL+lc@&U-`kn#pPs<ef^O5(!=O674>q_=x
z-3G*s_onTmk`GnE{$iKQ*(vA)3><U^+LYu4e1<B1W9q##O)a<(58cjwosZ%Csrd+G
zj8P7A(;(Fww%X-^8!^s5n5%`38ZrSOoO8x&I&5KhJ5LzT=~zvg!4XRjmwc0``6OJi
z2arJ={#ZYSCG3j~SRW@Ok)EF4(x;64jTdxcFnu~7>-mbz)pu-FAZ`dZeiS_-+AV6*
zWD9%dNUEo}Y<&(ZpqgE}Sz(Nk&*lFB)~-WMQ!LMr!Icl?&Obq2P1)O{x}1Z62ewcD
z0A7k)HY&+oBQhvdMDh+cw$u4%`I?3m@^ovt{ULpzae`Q{^{K7dTHVtiG>l0dh{s>^
z=}ftdu8k$XLo*`&Y(K3Q*5TxA>F7Sdha;-Drg-$MY1pmwu21@<BzENRKT=QOTd^oj
zJYh&L7y}2`Rt~bs3qqWSor}xQJ4Xt?i6`~+tD5&c2!p(skh4u?b>)q&Lu}8fP(P(E
zodJd=W&|uFEuN*Zz#l?IJ57;%#0qhT^A0+ZkMYG^*lg_NE%$~oo_XZ{ILSX;R3&wO
z#*NzOu^R*n3Xi%zZM`$ztSD2vc5+8<!=^t<p$Rd72Mlq$J-xqL(6-BD9FvfEqB`zO
zv?WMBbR+^7qK@o;z)=h-xZKD~MyKTf<9mOH^d9wUHCbGMFgeR-pzeRCy(G$BKoA8)
zUzE2X4_|VB!nRAPi+u|7G9B{F127rdIKciD&u{5f^4?QTC<V6gsxSxB*j138a*c@l
zbMlf$C%Hf8pQSc(trCC>5AQeeAE$riLj-0?6M12njnBzaK*{Z&YO4*C&gC}~yOv?;
z_5Qu9UG1Jp_V?=Lx{q)1sOQ+Bp^898aK!%r5znun^rVK_iF(S9h}Uxux=#v#cl<kd
z&1TAy5JY>IcUIey$NVd!dqwiemn05wJJxaYT%VUEjsgx90QEi3wF6AXOGW^&Cx3UC
z?mv*=RLg8v9%fzh(Nt&r1!(=5iw(*Uo`m)Kf!pe7t8EzJGTq5yNorwpIK!4X*n{1-
zAJVln>vRMkynUAyEOy`!Ku|uFqi(1$Lvun`6K0*D+Moei4Fb&;4^vt2KTKAPQX1M8
zFrKG3s_3VdMF;6zoXA#2w37n|DBYF4K8C$r&^%;U3*tKsk`s<P<k421oXF>rIVZ{z
z7!XGU9@NEPYUR{sL=tjXe6Pn}LsWJZAZEFe1+ge>(mHOW1DccqX^lv%(fd%VgT-0c
zNv5)5nk_O&`cj&SSoEipni~c<42q4#NDmYh$e~Dr6kcl53{w?&sZ>&sNfRN+rpeNp
z4h=i4W`!NdcGOP0TQy{WDhZDqRU@L~RM1XpBG?rhN>-hR7D&ex7;svnk;xRu$sEvK
z%OuhRkxVa24-^>4vRaco(+g0i*12MECQ7v^=}{$2I#sgBWEE0n;MLX~)GnE(g<~<I
zC9~S2BNd|2;8j^7^sP~>6^fwYs&zT2wh-LZs;9MEdKyXSJrt8x6>untI#k;+SvFwE
zg7Z>=PDts+aZzzxatgAGP(&)qQe?5)c&M0Ttyt|)$yt&_d8V3CN>??kQ9}SUy(!^w
zNW;AnLK8gF5k@E{6p|_^a+;ZVs1l?N?@mK#lygYf6oIJk(>|3cU5K}57elvmo`cig
zyDb(4C^H!czH6J%?H)^j>|76VUFEb$k%9oO`cbmEtRtx-C{@6!lH^sX8mY=#r%F<1
zR8@-3>N<l^^85}*etJ{gGs|#2eQGBl3=Wy-Dm_h68wCV}C6B9fkD#irZVaC@F=Ofh
z&013#enw3!fk+@m%lOh|BD_)T;2i$|vHt+pt4$OVVWf@t0}UTh^&RR6(OmF@IqZ3>
zs)a^;vIk?I{{USHgB*}VP%Li62<T2h{{RX>4$Nhlmlzr69^Xop4$uie!#zh@m4d$`
zBmJCpswAqrSLHcn9XRD_?F!4W*Co3UdHg+UK@el|oa3=LBAl@W$N++R<ls;QDhl#k
zqjqoe)})Yd7@;J01CPLpn8mzxInHtpb4?M1<P(o>qww^kG*tpbZZVF7>FJN@(xy+G
zfT~gZW2fQ%6h1d#g;VtYpW#grVRrz#eF5YD0IgAXa_p=;t474~q;u_1%B-YIAAA+`
z<FWke>XQ7BLym-Zs3cH|LCKA~4l!C=LNBSAcHl-Oi{%7hkMrr%@~TlvvdmPTUyywd
zU*}e?zD9UlfQ&eP-pBm=)@_(?JjBOuC3^B0bovlMtR<|GymX0sU}tOsRo9$$&-oRb
zs-7?j+{jm_Q(E(H3UPvR7&k%d{xzGbL|Ig+<$){p<DPx0q_idvQ#VX+^sRw#!aReI
zdfJvkEG0(gYK(nwYn-xU6M2MTnbd90Ff+#<mALW`vX^Gc?hJV923XY7*t)xvtw~)%
zNY|A?$K_MYGCk}u927WH$3j(!{-1~ySsaVhAH_6o#QJt0){%?HeQvos<jSrGr#${<
zn!1QRS-l?Rf&^yeNEz*c?f6z@@+6XdqG9(S-N5aS>P`pNu13JPQgAnQ_X8aMAJVC7
zVPb(3k{z}c9Y{bhzf#2oX#Nv8{6so?he>h6k@I_d56ITf+11uP-!M~_J(ub4$N9x`
z8hnJ@Fj;qDh-Oj4Hhy9~he7nNwlTI9L?CgT_Fmu5rDIF6rCk}8vhBD9u&(EY1BDf*
zYQ|>?9~mUE^d6?MH93|!#B2VroE2Vnf$RAGxE-rf%$Va_bRfuo)^or=-TXWI{U~1*
zg)fHYz+iU4ZUY{yTR8fk=qka881QhzsUTx_{{UK`g~V8i6MJCio~EaeJALv#fY(ay
zbFm-G35h;#+d}@Hn5wvUEXcv%yN-%IGu!Z}XSzi@R0cnw_Vp%|Nb(K`KU@LqDjb%@
za=phm3m(LjAnrovpRG1KR6&p!w`OCEew8{~p00r7pev9lk}?#MOJly@=qaLEPGYP!
zq79whn*-_wDH;em6C#XpgOYzrkhu+&j41x|a(@r-6p}0K+!S(oXW)G)+zJ;E+qOV*
zqqsjVe-WSNDx)lL@Gaag_lHCH)`XBdj1~ZQ!0S+$OdP?GM^-13`BHA<L?dY<F_1Cd
zrCh1V>FG@l3uZ+>R2clnPvukX!fsKO{_y_*JWzzmp_#wC_|}v%oD9}V=chGd7iIOT
zOqPwLlOnOT)k7XisruH03{Em?rmtr6!gkiGO`{n^;AbGpz>pO1YJ)~qh&@5YYTj79
z1eP^MRn8kdct2Xl)Xl3RxI$Y3tSopGM7cFN;8Zgt^&{8-sNWqbLe&c&nyDKmFrXaM
zbInLk%TN}<q3Slt17f5`$*2PZQ^d6*M7&dlCfZ(<TH|9}`cu)eDl^HUPdKP*#&1SQ
z-6{zeAc~tbgk)2TG*T$w=A(7ZT`}dU3^>hI%_7QzGl~p&t3f%<Bd-*g<jN9oY0BJD
zGMs}=2B{-CGSC?`{kRk;#XA{l<VPKWc&SyYJYtk)sg05WlSs=*q>6c9M-(7L4z*H2
zR^&MqRmU~FO`=Aq4M`jv=7`rOuEifTrpc2T99T6cms5!8+iM2Qn<3eOQI)BmiijMV
z7OY5maZ^S)s7D5-PAb_=4<{8AoYJ|)M9;-%ZAq});i>&<27YPi-K#`^oOGlkB7si>
zG?>U3VAN-mR%A?yghoeN5oi^0NuF^}>BTc{TPB7kJTb*AQkdopG5F9p9@T2sPd84x
z4h=XbeF8e2W{rFbC}j;@DoPnR7_1E&NBce+Ndq5B=t6~D0bTdd>4If>AFWqn4stlD
zrXA{|$U1ZQRt)GvWXL>Y=}`Ga+m!>{ig``^{KM-?p_jHt)~4m80u~&Azg*H4OksdN
zqMl!9<P+>EqF_!6F!awft~7!;;zPj#t*GPg4t-RLhCU7pZ9neSg;vH^R{C+xR7D{(
zw;&J+@6YQ(mdWP?bs+Ih$>c~6)|wRw7}|dkR7miw!*}XNX~75wE5YlGe=N|E?ci=+
z{2H7B4o(lgX{0NV1@5HeQZl*fFnvcuR!H5!VV^^h{V59ik_hY16gY=5QKMooFvHMx
zs1+U~xdVmzXCB6{H<(W+Bh*v`6=8?idnw21{{ZW#aV<z{vbQU=cR1{6Wnkdv?;qBe
z4UzXtXQBT9Jbn~Pn~<o^dhI{cHKJM)-lso&!Ltcv&+zB5J&&>ebwf=iS(zWH!32&8
z91Q(&kHW5M=^Mnv@Z&jePyV%M=oc}`B*<h1i3Pnw4*vj_GwY1kH`_XX#m<Gc$sPbX
zWd%?9{{TGJUZxJxH;@M603O^5*I=J13KW8J2W<EJ>nB()%7sS^FdH0>D`{+F4~XUx
zR!f%1A2V>?r2ha))s{jmnj^+T4^znh01xL`Vkc>%a!x~V3C;&>dsW%oODeLj%JHcD
zNF($h*E}z&(>q-gDk{vFW0wFD2VuebedAEw%v)5k;PEVnk9Q;dDp30+A#uWw$PvwD
zeW3pUY1xsS1Z;!b=LfHT->q!)iJH}$;zF0FbE_bH^am$D{bBjhs!Bhz(aSk)#~kD4
zIXV0RC+ZfcJ4)7zaJb&eFfoi0O*dMeQ5~!Z2}aAauVaosK2Ok6zT!&l5oxC0Nj9Qj
zZrTS-V0Nyy*>Mncz){ft6~*YPk-DG<EZA%g)!f={Sb=l{9ICP44%w)pRej1=Ht^Wn
zmk3Pn^A;VwG3nI%^Ts;VRwv6MJ-@zTh6~Yu+5Z3^%hs>zsuI`DXZ`6T7&tut057j!
zO3JZyR@$Uzxg2NM4$J)N=B&{WcTDMRV^xflBc7~#(`F<l#UQhP5$XQ`*QK{oP!?gH
z%tc5@X8ADZ`_gA0m1}JbZ>dQvLlEppQ^_FUnzbB201yxdrfF3qke2fcj@<E|U@B<g
zz{v;OoC?iFT(>9GV+U>q^&L8M{{Ysgj0)U3@5ew+YSRoKnUF7h9MbMpIZ)63e+smS
zxm<={yBI<0asGcQVFZk^+}*G`eJW>^x9|n`-ZNEM8-C+3!R^!eP{9i7fOC!B=Rc)W
zkT@SJaC(wOT~&1>c_e!e@u=h^gSWS(A(`xj+YcE0Y2q}5<xU&hnv-B8vuE+D(iFkM
zlkc2V+^bx$p(OjH1L@fPDn*QE?^b+!cV`FltI<ebgZI5pf2~q?C94;qJ4oiQ5xz6t
zv!m;d^`#yP;<Sr1k~wW>PiZp409Pq28#g9%-n}CJ-^`4ITy_1Qmnk7>NXl{0#8I}9
z+XAMNHd1#CRcQzVn!@^=$qg(3$*S)t&sv7!Vmclwy2x18h}joB&w67V^HG9vPgA>!
z3P@DRHF6`;t0xR9<aIJbXq<GY+*81&wK#~33QXat#+uzIk|siyr>`{%`qZjxGIk%f
zowp*Kqau`y3Rlz=tVYN&P%x`JnvK_*cQocR=3G^LRoT}fs%<`%ab;TQz6TVlywtmB
z#szYtiLx>&F-pYdl}CECE8I{ikKUxH<jyG!s?uSwX}wPrj=5@dXhGwpZ5C2w@g^$V
zOM_6wR*`BZ5@exxq##q&QZVAMGHj<WT8VfyETEc+M>V7oaVAyCser(wMZl+#RM~8=
zDdMWL8n-B_j;AyrNRPcrl#3o}RE&dG^b<t!jEb<S#X}NeuST2-$%!HlD>T=K9M$hE
z3S+MnBCKZ>=cOjfu<3(LBADz&u(9H{w8%!`nF5N;jB|n3wzRtmCVcUV(vs9^S)FBs
zXs9rA?^nne;;LEMn4<xXD%*UkU5{gUqMXMSRP4@s)v1np)_mC+EBV$e-4mDja&d~U
zBX8(&RZYVw%VhlxL~fc!z>FyMV^Jn)7?w`(+?M<*MP|lJ06ot&RzG>O>^Q5DgN?up
z`*3PeM6q%P&|DtjxT{FA4ns%n$3LxCiC7%5V1L>*bS_sUllXsHxRD|jGW6WP_thXI
z@Ji>~nwJVk2k{*$Bd`O4N%aScWRDpGzCRj~TqyaAkHDI&z$os_dm4?OV9bQ5?~_)9
z(V!Vv{pRPfrnpm(u6XyXnYD{sY=@I>J$U2ssx9K1GRMoAN$k>sqw}hAk5MQ~p}V{X
z$C!BPGyWo!qk)AP9-XTh{6++U9t(+EyIZi31N7aKQ^Bji$8n#_W1b{Nk3Ud<oh=wz
zOLF*-uMHTD&tuTjnVFY6dFg?Qt@fCuLApd`BO!xl_4KOp%3{FW%!BxW`=k6Q*f%qE
zl_2g7hhO2(U#Iw0T`x|GYp=D%Bc;NqZ>b-`N1++P9{&JJxv1@)0t1F8rs7BPs`@;E
z5<<a*sQANo2a)yX^{lM9?rx;{4eg*ZXOZ0JkSjM)h%_EsbAykl_N(gZ(DfMy8SH&3
zjrki{vy!6#AM@M#)ih-nsmMO`c!>mNL(gN*Kb2`d-!|NUaIq7C{{Ve+`Tn%qm=)e2
zqiFlX^CbTODv_af0RI5fE>1Iy4nL)51#6p1SFsFi8tLDHk%9_?{5d%OMyTDu!u|jO
z$C&DI=uX^zXEmiAs;rlClZAIx>Cg<E{$$f`;SaUg<7;o@u_WjDkIJ=b=#?umw=xHY
z$^dvPxRNu|j-Rb$Y8M}Bl1UV>F2xWa;Z%G5D#oDh&WxWTlwQ5C2|v!MM)8Rx2YMt+
z<)OeOh~zKRIXUT?=bo(E7Dm34=C0p4$vNGy@CTqjLDst4R%TGo_nQRcx1g?POoB93
z50o3XF&{C`I{yGKO6l#vW*%Nu7mdy7=s(7)lv$lwCdn@C8fMD24i8bBd*k}{tcW9q
z8MdZdpIq)I)YjeI)Nw${#|4ft)Sqk`#)jf{n6WGde(J73`mySNPfFTp8A43lg-AeI
zOS$zp1OEW6L6MG7I%hw1dB@>Y((7_<JKG(8V}bR>O1F?j56vWTuW|+l^ZvBfsEt!C
zqe+316nDwT{{UL2QyE}eV|)Jq`s)F;+ZeYoodW0Y$luHV0M%9_)*z7;i}1raJFsX;
z4cyxTGCn|`&YJvn4Ufkap1OoB(cxk24OvT=vGQ$Sp=JJ6S`(<OM^MSn(Ek8hZ=TEd
zV$Z%np>CDe<YaDo33L3#96zf48#S9L3Fimlml^at8jeB<IYtM+YLg7T3o!SjW(>Gs
zK<({FqdHa#_esxegT+#~XWO{T_+qrCaS_KSzCr3MF6QzR{0_u_bW}*RK^*6K<EMJm
zf?z<+VLZFOU`mfdDp;p0j2!)HV(d91q>aHN=I7tNX+YTmxyw1rumo1^yYMS`vlfeT
zAZ0a?djxw<R<;9iRj#916xB6yP9pv|n2<QEJBYEIb6tJrlo^Qvv!6+0#@fP3+`-dr
zoP@9vPDM5=a>s#PY0w@+_iDmu8wLXNO&Tg}<VCAD0ePpjvqU+rz-ZoD{oZO8(b`tY
zUMkAvIR%Z2;|8xH6&%+3=x26z@ln2(#rPZwhjBQW2RNpXde*yX#CE7&#fji?LUtpO
zvoO<0z{#u9T9q6vKwx9kP~F7h%A+LlQ^um4w(XvkVmnqns#_9+BTSRUP32@}t;1)5
zk}6XHCjeBB*t0jwz^K|1dsfof5_(kWrvubfyO$aa=9<x%bg2Hs-`zFW8x;eALs%$Y
zm1^edk;=CjsMyn()GXZADDn3KX+h$nCz=5@Rw$G;HTe{Rk*Ee&ag$R<pyHdiH8ev9
z6w|Pl<j9z-C^1b%sW#CgOI&yz=^LdvM-+ugtgJ(cT1Cx9iOpLnii>S(i;5hyH8*xC
z3sPk<jyqIrl_5OR<22$Zj6~+A5rImhigxT%lPQa|VyA_LRu4+AwC1xBp~fmUrmUG1
z7N!JJ4AccqYIijsTC`CmyhWrOS7oGI5gOoQ-nor9D3ESn{=Ifq4e|i0K(2~P+LaSb
ze5=;0%O(X#bB;|@o$;P(<j$C)wDSAbbhBiY-Ph8sOxftds7yj0qOxq+A*`ycmXsX-
z01j!NXk6d`F-;*@^cnT11Oaj-!1m2UMWRKx4fluSD%4w#%%G3>6;y$k$&I12+}5OO
zxEbf@aYRr_mLYi+0b`$R)S;we3lOW=0h3U9>kz;g_a>9=MYvp(>(Eu1*t)@S&g1x0
zPO6K%ug%zh^{R{A>;$vt9kOv$C)6f<u-->}bg6X`DN}5{G|r#J$^Pf{tX*2@#;utf
zrgPu$AO5Oi-OTPhypz*CaY3+ViOh#{9QDO0%b!BrRCQ;bYnsf~kUYx}`H#xd63zY<
zhZd!JmdkfN#%?9%0IQ9I{(5@nu>AcirSV;iM^9;H2g!u)a5|Ho6n%PqGhASm?qo7a
zjm^#gz#f2m4wcfO7{>M`&9|XtH4CrexVY<*M5G$i`$nf6YEto~1Z7c)Vt*0O>scUS
zG7mKy-Oq0m#}uxp!vt=}y(seD%uUGIvGK&Vb00R{kx$-e<Y&*m)<4MCWn<#FC6qw3
zDRF@Ni8w#qAoRz7f2DZ!i!xowbjOIkNP+$-I*-70{Jm)vo+(U<?622wJpOf3rk2FU
zT>GByHRzT<G7<pJ8FR{?>J2gr*vy5Mwy^^seRG_8`q!QKW5n+Bt)1%(kU~m&pJQEF
z-30rYxxos<qYk(};QnH=wxIMn$gcr;qX!3YU>E3rtw$uwG{{4Ldk5Q$*F9&bI+&E?
z$XE=4+ZgBbHKA`hNopKFAd$yjrz81QGBz(NDV+mjDuiR->q?-4p}&ST>EE1V^%Th5
zJ4G^_=Q;Y~ts)k=X&eoO0*rS%s8I&NZG&3KiSutevFV<1RIcIQZU(}qYXOgKr~GTC
zir5F=)HXjtYK`<NA1np|VUMV-B^?DB>T(E-Rhm$Ga8D$1SM?^Ng2ZllW!&4@<T>m!
z$JZwnywL5Dvw^+F3I70iW~oa4TE>HsjifiOFu%_=k0GzLELr9+owk-F@t%Mjb^ic9
zm8JcpakX0q9P^xig>usB2Vv!i3xSTLo&`a7sXHqwouR%^0psiXev~aXA?kEDdZb_#
z8xS`Hmcc*Q@~rgMp-_JA#Pu8xObX>DpD5QC3{o{8jz7=;0A9I0TV0y?`A`!Ly6^xz
zx4AUbT$s_`-~2$fv|@?@W&BC{dJ5;SJZEu!a*(D!+gzM|Pxx08C;B=wj57hBQJT=P
zz9^xwv@jU}4!wBvtyMK^h0=CL*wpXT86q&T&niPB{y^aVAC)#Q6nShK>raV*>d06V
z_2e3%bkR#YG-Zk$U>a<V<|79In2ZKJe?IjNFujXaBzgvk;)^FmF-ssr{4CpmKZwY#
zy5CTaKnTFy$YeOr(2DU53f<(AL>a&*73dmQ+HBxcCS-w+%zL5rALp9Nad3J)3(`vO
zbI>BVE`Dd;eOLm2I<S%|@JUnc#d2`!h9>g&41vmmKQZbLTGEG5-bPO!m1ob~9W$U?
zkN`Pj?~Kwtz-)|j^sMOT7#v`GQ;dL~G70U_)uT3vWt%I8WeR;y@)bhrV5bBr<BSTO
z{Ikgl8`LN{s2j;DRmdFiniwM<L!6dyp7k7e3BPPxn;rXBo7*h88wcf3EsvD>2NgCc
zvqHhoaa;D%aacC-F~G^iYukdV25Q!XGg%uVm6Q=x<9?jglsT;vGia$Q3V;an&MNJ~
z{JE@Y+@l;CndoaxCSL^yKq_QKJmRsE=LeHklK27Evgb8)%@Yc@Y78g`097@;2OQMN
zKI7J_oe<#1Gzv!oqHARXrCHt$E;CNzv7Xk7dsHuA2*xX4U>wsJQ?O`ZzM4HMvuR+S
zoL5D#R8jy1CV@pW5BoVl=}ucUQ_{3pQyD<(P|XxkPstrIQlttg%zY^VIp(J2@}hM`
z>?%PPYADW4T2`in)rJVb>qzOIf~Np=G#p}?Cz?G%%bKvlm=~JC9tt4<(uL-y37TIo
zj+LQEgS`Z1rZHlTqNR<ZB~MzIB>+^YxuhhCA?brtgv~Kmik>nkTACyqKop<~QfL(H
zQ$#YZ4MKWVw_U=kNSLOHv0cqZUWqa)A(BlnW4C0~sA-D4Q@T(nlWxT@y)$(tY1pyF
z=vgvqCp1P5X;?+nb5d@|sE~0_98$0WCWQOcq*BCyxaOKeH+0sCO63DLrFC}93UHf6
zU}-QQP;r{*)PREn71<SNnn4Ve@G6Y0oc60S=BUir2Njf=)f7{>BeAMYa8sZSk7{+y
zrzBQn%QU<rZ62Ad+>O#Lo0!~u+f@4sjhr#aEC-;fkyuN(ZgIEQoSLyDk=ud>N$*2p
zVnw;wK*`VhMy^A5BLR%#@fFTJ@M(bB<Y%sUtAB1g1~Lb`0aBRUO}Dy)01GL0_s{aD
zZWCdA!}Q1?ezlO7U<~j1h{vF$^GlvgKsQoY3LLIA8yg17S(tfp05QV!AXT{Z`2ylc
zEV%42b6NUz>>-GbNSqF&Fh9>Vx915HK3nz1Mn9qEw1gwMoaZH>O>E+fWt1*>z!*Q)
zl-nqTq+z)ne7O~1tcQg2zJP820QJ&{Sd+Ye-)zLzv5H$5%1xeg;|oJJmvd^X^93i4
zK|ZzS)&Na!f-{bz(!H<579$0NDb7LL85#co>(`1aO{ME|LozgI>~WEding1JE;^UI
zfD<Hz_RUe27HM`U%Q4yqsjc|qNfTi!mN^89%fEt4GnFIfQUUt)`c@I%<@p#bx9yT5
z5MqNUr~Ax#X8cd-igatcig%RG<N?V!HKA>4Vvo#`K74$De>&)t?nGhAjF}Y2F6AXy
z9FJpOf#LmE`%=L-ureUp!1V|G^IRq2z$Lgfr{S4e%K99!Rhxz${i$-^Hyhr^a$s+k
zNdExV;DSN*&-AO1lOrT*avAaO*R43h&uqa)BqL$K>T%cdH7daH%+3J$co;e72m1d2
zFM5MEYFo1xK%+l4Fg-E<0M@N&NGbw=c7S;4(?8a;t>f7d$>kJh9CpQPTSu?}y~fk+
z$NvCaRisNcAOp%3LjEGfu({4gJ#o!wqHO?n0~HkMxer{9mAj4Aj8v@7UhJrpf;cL~
z{$L;HpTe*;Yp*5U2nAPw-FfJLophRvUo+$dWys^d9Y^7c<!@p0%*wfPBT@kFa6$a|
zHO|(i>6s+DI5Hl0=j)D#`qLst@=`}5=I!m%HELKHBs`J{+A-<Wf1hfjt6Vjefs+Gv
zz@z=|TAN&FHLvPIYlCSEw$=OiJbf!C%i6zpCZn|*=56OKxRQNqXh5$9^%-UR!mX}d
z2APF*88+^Hvr}0_O}u-Cc&Vid6@dWmQYrSx8CEsN$^fS2?pQfrJ<BU^l^ahK=8UK~
z908tccwr!9gMrO9@f_`rCH>^2Dvx@`I?Bb^*6@v}ds${HoW2L=UV<z(TPE0x4?;hY
ztep=@`xT|7n2nA{nArV49{&K1b}btuDjp%XjynGUpU$+Fq~6CrCX!~iA+5=H@=)aT
z?lbz2#;QiL+qTmaf#`ExtWn0u3}7pEAnvMn)^cH=%ai`uw?CH^&nkD+)-`%F5)0;u
zx!f0_$m8{`IH!fvK48J>0jyidp=|#7-NESV)AOo=S)l+J!CuE5D?7NFSxD$E1X<jG
za5%vtsI310H&OWWRK?_F2xsJf4QJh6`AkBo+x_5r)n4TCSn5UDDgb3T_Ub9c%j1!X
z;$zh%5`3Y7&wAOm)FlHTfTy)PiNNXY<U!T{01D8K;X5#{Ij-g>f}ncU$S$08U`;-B
ziS}n)lY@?NR^pR5!Oe3v)5zE)*43<;12ie!icqpNiQK%_KA|4pm@R9m$f}oU#Z7Ix
zA}-A49!wKan}R#lo5TT+t!KB=xn`_(#oZSox;Yhk*7Q8GN6us0pTuUc)rhCVKbLy{
z0PV&P<VgOtnrw1N>tdaF$*W5!Ju91r@DeiBqYdJL*0hYwr2}7Zqpebo<99i!#GF>D
zMou;)ViZ+o1-_Lp-ZbsG6<4^nDKhQmp=PVHpv6^a@sfH}G-(!B$flE;M<aFx3XI)r
zJDQ3g9V$hX)P7zwnt)u!(#aa+w0Ozw%`HPh_u39|nw1}QXDjoXu^gQBp-3D~A%!#!
zY0P=07^z3jp}<suMk%XLrcoOTW-3+`&^H=s7h$ToH9DpR2)H#NIHb^+gNjvb8g6N&
zu|g^(kigRCny<f#a=v(`#@S>rsHFoGA*T_MOh$Z>^H3GRtMac}sLRa+whO?flTQ^J
z^`J?MjwvxfJYtjzcP`=;ry#+lq{tLpo0F;HotlAOqz02HaeQ%4f^Cj7(x856V;MbZ
zp`EUiDzXd;-BZU}<g{@&9hSOivKNzGN?IF0S&tQ&aCkMPCrZt^Qgd0z>4A?q<nzT+
z^J5Q;oc${F>N0rEQe^~?7n;Gdp&)69UKp|VtofvA81CSD=cQSe7TSJoze=RAgC-hK
z2Yi|?Qb=n6x%solUA0s^JF;aZhgN@-Rm+=D>Ohf*_7$3_ut|l4wmnTDrfEcrAlriD
zsl{kP8X)<$?mxmYRkY_bN1dBfIL9B(uDn}vcLm4rf-1RmGji6&_-05~a4WZ=1$q4V
zt8v}L;9y{S@%}Z6hTteMgd@IlpVFd{h}>DQ-|UXG?$E5B=Tsz*IpqQVB1RAEN`@&p
zGVlKC<AGdgx|J9on;%}adI_W`Tgiz|da&#M6;Z6VWXhJ8M?rTCbHTIBa0tN#xUL&n
z@Z2}?Nerl~F#D1Y59eKNwxMn$2;w!Aj;+W)n54V4iuJJ3#;yJ3;<o1Mh>g~a@!i)t
zEzyGdAbG;e&@lc$`&9Q+!E(Dm7$Z5Zt6uP(%&JUeOoWry<~i-3O5=xzFW|d)f;2Jh
z$Z-5;>5Nd}8EY3}(`ol0hT7aozdWz47n5zvoMC#L^Hzt2^&PGLoUAdD6*>Ist)GQ2
zuWq)ryrM4VLz2hX4k;*Ed(3ZFqA&*Q`Mk}i%Lli5x{D-taJIs}TaGb`^qn)nQzCD0
z?m*~!W80vl)O<Z00hPe{dSjZ?om$BWC4E`Frb3rC@HoZ`1O9#MMXUo#5f=yn+CQ28
zL-VRW7))BK1pqI;4r`-pb}1|LE&`5v`hFepQx|k&HD+W-0clF|WBbJ8(||u5R>VjZ
z!Ny4(aon1U-U5uf7a(ISPY3*u>sm2F0rRoA94Z6ew1jSFCt_d;L+wnJOAnXUtg7}H
z&$ThvBn8K){OhNoiO%Y_%_JwwVhQAW<X1jyf|Ybo2R%J92nX=UuF~!yyBl+p^DnXD
zvE2%$)+8`IoOR}~oK@~=7VL8umS8fs91P(1=LhsP#dy+hDdL12jpdskPBZz}rd~&w
zU5bu6;AXt5#c@66s8T*zNmUK`;8t<m+9>51m~I4%_fH15EN4uDa6uf@(Rg<Ct(ERA
zK#Y#R=cQJcN|wkMSHxgshXnEU6;o<^GUFDI+lxhz{_7a`r`cK%!*TD<Dzq0a#{dQ&
z?6Cg0t@}MUR=bHjojmQ&-*d<5SvXH()R7Fa`4POVM%3q%(zZM;qCLgu_NJnRVL(J`
z!>Q|xdY?|i>smU0gOUQb+O(oc17#sh4Dfkg!=C(~rFKx;wb_LQx<zXk{nScE2d;ho
zwA54;-sIl?gqm%y&esz-4&387z^!EkRC#TmsOwRMjBs8+20l=Fei)|QU114{6|>Ng
z*&f{u9;R<sp=TjfW);cwQ%rY^lA9N9IRt-P)*PD5YQ<zov)Jt*{{Tv7_KQk$9BQN8
zjb`SHP-V-%vkY%=8~wgNBV5+1>~rNS@y2){{x!1Kg~<dls<KHT$k`E7{obSN>r?R-
zpDD?-vJRQgq0KS;$-pvYaay-=6Cuj&Ur;(zVzqL9f1FhE*uG;jC`Rxy3H2haqDvpk
z#tmEk%SHL)-IM$|rx+PHagWC|IE_s*7()}uKSNq}-zyFQsNl6nAeSbzp|p$vf@?c5
zwp+GfK;sp$ZO~R^5QQKBO>00f2NiKfb7q81#X3M8X_1kNbMh;pY+=;H)Gfk-O6IU8
zkSQX%d&MUsHG_9(GDxhdF2`Lb+|Fx^bQKUK<qlf}D*!!v{{TvRz}fVw^2SwiSoIw%
z4Vk4$D8b9IVJ>#@n!63=I@V(ZjGPRSO%k?q(xIyw=+V-_eoso$g6DQJYmv9OImK$*
zTpW&QQfP@l>LZXfOdB<m1;#+;t_g<GgIy4v&S^w^%mq@AvsKJztw!Mi=M_&-PQ_$|
zV<wTCZvvey!m8j^Q2Whh?2Q%J>15j*HhK!7Zy(Cd+@4KOdck5?!R3yBDs&(vs*Gfk
zO=R8Y%45%Z$#x?7fvF>Nnq!lXN@_PWo`+rTb0l*{DpIt~rl~XLlNCDDpi>b@H9k11
zD20Y;SLLY!(iG=3(P@Iv)V&83#%u~15@IuAp)Ho7nuHlSsku>l5)PEB!KVeM1*y0;
zJsO{%T8kJo@N-s}l!nPrGS!9|6%iB(vb%9n9%^?L7Ac7_gRMPi3(Y(&OJLa2YBtEJ
zSW|Y;n3E$`Ak^Ebfhp>=+cdaltXr!Y1{RnUdRB#u5^n`insC*Wjm<o=Mh4s;O5C?s
zU<*|%3t0++%Zk}Zwy!nVqI5;*K_Y-FKI6S<OVX&wh0Xvbvyr?EYlPs|WUN)Nqb=A0
zT`O9qGa(rN06prQ^TltKf;G>tPyW4U$<@0XDRwZ$t;>SUqA}g1?N;E^?GTN}&5-v4
zfkR0&@AHv>dJdIhIT=RO{Do4CNtBsZ@l^p-U5Yxf>sgNg19t7J>PIK&D`HsZk+KF&
zQ<}zh<(RkV1xtt~Lu%0(Lgxq3m=^y4>sG{3B#ZK*f!vk<02<D;v1A0HP)X$N=ku*4
zgJNT0^(+rR{c34D1e(5sp58JK%vawaR&CX@FWr;PPeK95=gnD__8fQgZfYX$FIG$r
z+2iswuC^VznDgQ?N0eDlU4y6iQ>~^%I1);@=MTsp;-$FL_c3HKsq``xAC*^FriH#;
zqz-eOkL6P0*=k7Hj!`NuBu&5G3<~}$#a)MhIp5zX*&jj<4l54a4x@56bA>;xZCa=?
zgD;fn*LjZwenyD4f>vgNmh#8SnLWdDY6)!PowFny0gc$l<=(79k-ss_G2aq#{EbeL
zV*!G>J<E^eD_h@lEmp-8(rpL@qsURv@CVkdT3EpwtE6fMFuZlCj1f2=B9Wd0o+`qq
zg>uIS`^+#ZqLDIeCJ(m=pbu8y53O=q+)We`?Q{;vLh*nz>0M0Hhb*eH9CAP$oc{nX
z=Um3VjDRx*M<?YC^JD)2uSDf-Q0XhX9LI)rTZnFan}oxR<Yy-!<AL70=w|bQX;(Pv
zy;O0}<^DD24HuQFd4Yz~!*em<XE@2PL(yDa+AM>iJ4wa>$7KWFrkib=){08y#|<e`
zRN=ZH<B#e80O}p<V%`v{tj?r?fUD3`Ev-~dxULU>{=I6Vxg4lb=nZz$gT2lf$!t|H
zjiii#Mm|;P{uN$GnTX6l0pE5AAJ?^Oh{kh_nubXBp#T7Vx>n5@+{cm}kD0?292{~*
zVBgwHzk0D?pEd_OhI#yPkF9mmh6(d-+wI3nrz~;1zXT41<A7^9#`+YcV};Z&QV3li
z10D0uJw<R@d+GY7vQhKr$1j+J2MwHeuA|0DEO#>72T0>Qu1ExDx9V$~()0<&qXpf(
z?Gh`aH_P)f;1a*gR}+^jcRS$=#^<P8>5w&=$08%654^RFci~89-jBCyAD52R(AxQt
zMZB<R*l^%rA96b%TCj=omCCUrIQ04-N_sV^joNx1dOjaoLn6r=@z)(k<y%(TNsN5c
zZQIZV?Oh|=xKpqJ><<+}Xj?x!V2+BTH7&K&(MseIX|N0vnKArbe@@k8m5l9obWVVP
zf2C8{p-W}=XNLf&AC_u2ymVYFtfX`Ij|bP9j&z60*=~E|Z^B;64nF9{e!n+0LTy&<
zl<tvFAI22%?nO|V_1I(?r6lB&$fzL1&Ige3pYLp758^5vM$1r0oxJOpAL(!-9&sCU
z`kJ#In%omM?_JGJiZqWPh}WKTCTec`g91K-0+oW~av*`fz4XDV&e_f)LG8#jJ;afy
z&fa}RQI~rRuHRFUO<fbC7jOstTYG|YR$#-oF_1fPP{k2oH*O!TU5z#<JMqtTtr9bA
zl-QY<jIYwE%ce+;#F69uyo$Qlep2wvarsqv^$8@9f-(u{+=^|*Ek_P+gQC-?z$0n@
z0Ca;_qPCGf@IRGFZ45GC`C=pLr;5;yCya3Am)A9)B6}J+GB_dw9AMSx(S;&29{&KP
zSw?R7{Hl^hQeeG0Q;8VsPvnY)_jW&qD^x)Y=owB0VqM0{PXG^dT9Y_A1&I7>F6u?;
z)}nSdr*kG?xjAw46;Xng3NlGJ8LP9EBZ19Om0NIO)EtWEZ0K?3i%dz_#6s;Ez~t3e
zl#|U_yKguNSAn^Q_5QU<hB}(fu)|B5nq<*Qx20W%=z=RKG0z6Az@?Z}GohoPww$&(
zt&4XU9<_;M9x<BeEmd$ZD@tcQOA~Hva4Hvsu*v4H1ny{L7~;B8Ir*$($!t$2ii>Ge
zfm*U6fyt^9l~rc!22DdqwJzpu%Wj>IWy1deQ&$moz^Xdqn#@egARpdr{(RN*BZFM@
z+H!I^@ZR*LF|tN_lTo69-iLBeCYWRk(wwY!#KlpQOShVGigi)*19tOLfyFaC(}PnT
z#oJ9`QZHIY9MN*FLPlyC22EPqQMzK2V%)2?o{D`cK$$f#kBXau8jI^s#wp=2DH@%I
z*up7>pPEk8$he^7Q(0*`Q-SPgY%V~{QI)F>DiS%Ra7=2gDHPx_N<~e<vMf`!(t*u8
z6z&LUT5^C-N^lvZAc82k`HOb;rbh%<Tcz7ZiZTXkl(Uumzy)=>Xpn$NuG~6C(04YI
z0ALDJlSrHnY5DZ7rxT$vyT`^&Rh9OPD65ilRV4vOtzkV4Vr5*Ha(5cfkvZozr#?wF
zLS8<#&i6W_u@Y@&2(je*R-`s*vo3b~R#3Z&*RL(tinSi0B*<GHT6&(QwTo6Wnr4ZL
z75k$<K~TvK@$%-VwxGQ?6N;@iv=Czfi0ztANJ<R4@Hju~a0gy<QF+Rz3QUK)Fc=k;
zHmqk)-V&;MDxRjKhG$h*3aC9v$sa-6>q3&~Hx{=o&lF?tGJ=2E#b&cd9Dr<<kF0G|
zNpa>j)hbEu&5~-Ywz9ug0DUTF<T&}4qkDEhyJtXpi6c||#YZW%Kv!%|2=v-V<Z6<~
zD&<T3qp%sr>zbKW-;hBAkC<?Otxq#5%ehuTvQ75QBre~=*dYG^Lsx9?rjA0=M$td_
z&|y#4wKd^T2;9gUu=&U2aZ*}AavKP+%Nq6Cb~qnHRFiB?*&A?3v#w-;+!2z&Px9up
z;fZ!`jvSyF#=>fafg8fKyAM#TtAqJ-{HtA20bQ~#@Oc6?{zQXaF{Zkl@|(E$6gNwD
z8S1H({(}^$19`+nZWQO1FO&UG@~bgMq=3;kKAFemnz<U|jqDe$G5-M9t$mfu{i8)A
z45=FgK)#0^{{SEUy>mL9gtsy=Wf%i2$G@<y<I4;VLw!iaLe?nc78wBK1KiY9px%hO
zb*%1rn`lxA0(l|YM&>?;r>X2X$I#btq-pWmTgD<kBCZD{n$SxnfHSrO9+>Js&-JG%
z-^X0{9+gz6sOrM4XFU--ah;%U<Iq(5Mmy7D91QS%>BQqAx{^7=oM-f=mCjGSJ3RZ*
zkVbpZ12$ym12szKA^{;XGti#3d>kAaWXrt>9B03!4GQ|6apL<p<iE5GjmRv{6FJ&C
zdUMwta6Re^3rL_bvSbBZNw{YVjA!uvRo%QX%yvqFn>jcnbgEHksSw=~obK*>dRHA<
zO}m|x>ZKbpyf8}`3=hr@9Y=HOJ!)Wqn`kI|l_Lxfw|dhSVPe4<0AL(;^dhQHY~U3H
zNHf4;mS3l>OQ&Q?nrx!#th}qE^dLBI(}Psaq-I>ROSE(*7ay%^q;CHJD?2_qf(Qff
zAk;S!TEvGUHa*E80r?7_W|}YBTSE%-WMHUeBeL`Qj8+}Z&&*4bn|KPqbNbfR_3e{x
z21Ll`<Qwt$=DGHk2@82~?iu+=kFVxUVNwYj!dD^_yD=^z+wGD7BSGB0e+sn?(1i#O
zFl=<{KOSfd1YvW4e)b~+^{HCbCs4-YVCj`l{=G$?LAJV(O|3{ekO1j~$)x*hTmV{L
zdw%Q+c(I8IBIy|WuQde52p21~x3cD)rPP+!LgJuh94mU{QZg<|v<K>=kLOv3S#$s@
zC)aIHYpq8q#DD|rF;{d$i`>zOsN)1xo#>+y9zAnVE5|fs#OVBTGlN-^T{XLvP)0ou
z_|w=*H!s|^yn_hjei_X^($NkNB-KlMnP*{<f$jxt+Q%9GK7O^Fnv*q4l@_3YE-L(J
zM@o(efWhGV=C2?CbBd&l<i;sdaLa@0Ds>q5r!EM^9#%O9tcC%xO>zjQ5&iBDtwAVp
z&}3Di6mHNrLs*wsLPkw%-8lq;F<94akVqWXRSueE2^>{dk%cA9?1b=YnZHcn^{!^l
zsG@bx*;xnhoc{nytm%`<s}}P4ti)p*NamOqc*zwDvCoC~d7;^nB8}Wug|OzTh`=>w
z62OcKmX6DtwqvI?w{E};3g_(|xZu{bEax0m!Pw`hu4@TY4z(N3PYX<n7CmZV2wt_&
zn`b_Vr6F>1O=oHn6^KXO&OZv;byM3l&*~C+my7`7F|>V6Wm)pUvnuoDru@cQM$a)S
zKIp4bLm=jxVB#nXW0pUlsaof8<W$j$P%jM^8fd2Rw^N){TY_~JZd;Lz5mjWB9Agwo
zBd#pWLr$iFiW#V9%>gv3Y9e{3FG>><Wa!+~olQWdrc=cY*ro(h70o{x6y;tkE@2`i
zOjLn!PB;}CJr@SWW5qy8r7cmGtrO5qifGL=nkk5Ob3hb;Q^2WVVXnCq5ngJ+=~1w$
zxDn!tQ%VH_8BPL_ie8mKE-^sh@k*-Ba6dY+8l-q6^Xh+~tBhM6i+XfTxEU9(LEP6{
zW}qM#$F*}-R}H)cUcLL8=<PQZ73k5jS{)S8rmMzDr)RZ7q=D9?L5@vm-PzKWkl8pD
zN^FzTt;*F(QciJL)HI1sPzFz0&7Zl(D^1fqd8!vdRCD+W=aM&rG21?FasL3<soFvl
zF5PL8FiuZX*i}hx8Oa;S1EQ#?S{l7XmUIWH`eLg{;fKq)v)cetwC|2sf%F{HWVsmH
zpnXTnOO=nB#aEU{<PYVHk8GdA{A#TC^Bz9NC2q^+F#blYO+MmDU_Px?9##VjFF)^M
zsb~$S(4GxK^p(x6sxdqT-+}e#n!LKCgaGizjB|x6lkM|o6^<qHqcTY*UHBPefGJ?P
zNgw77Cq0Q3uIS0Gr$G*|TjlZzP&$&wAC785TI7QAvS9ESk^1vl@d#wb5p(WIKhCBR
zN~aL{N>5^o{c3E*bZCn!jla2V*d0{l{)hSsk|{3Xak@2Q{@hqUf#Wq+*3$7xma~RI
z{pDs-e=qQ@Yc#_y=q_MAep!nt{KaVq>vTk<tZCY}nHsdB9!K10@t@Ba{3~wOD9mAe
zwm9Gnar)I8aUQ@-vc|`|XCI|#!^(m7g(v-x!~83@3%ScCT8bc}0XX!+{{TT#BF3C5
zs{I8(n~w?!^(~%%TC%D}0So-ARA+JFPf^FzRDfr%CXqlLs!ZTx`eK-dI2(F#RLUyw
zIu0u7IrOTMow*D15zZ(YB#Dp84l(WhY2^H)j`-%Hjkp-e_Z-x!+q>{-j8MgUWKo`&
z<Qk1+`95QVj!6{$Ks_;v02`79diqmj&dzwxr640ahBKd9S0}A8GOUpnWdX7Q&rD{l
zMhXqQVDzZaWx@2TN(cDnronN$K^Y^GIjXZqw>jK$I)E}i1J<puP<;<-ts!ijFvsVL
znJosHP?-q%2I2nFWBH1uaD<~0k;y0UsrgiXA!|xhSrcg}Pq%8$xsG6jBE+gYm1g|;
zsBtMs=CueQmOw62Hh#Mz5-|RPs5}lMD->qlA^qWiKRzpF?&HZ|B)E}D>aj!e*E^}r
z1c!%tTy;=b5%mCpT$K_!VA~@r&leC%dSs|y%ZyY^FJf$?06P8PL8=kYEMw%2K_{e{
zKdIub#XRHYHvRc3I(}8pYn$G~n(E~9Ap_sH@~oXg?b%P3L;8T3tMc6!f89t>{{VHc
zr}L`LWRPLqhtTG!CPy+bHkNJ1IRVd5D%=8C$Y5Bn(zOMwilBj!*NmD$f%26fT9*?=
zG4R|Cxfuj?+t#f4o^SyC$KI<df__EM(W^$%4dfODPoWhW>{Zb<q**vCk9yg+w3h&?
z<WkyM0s#PiCbeR>WnMV?RYo1m<uW^XnQ^=kR@DwOx}2q#hU8R=z>WoIk&|~L8<g>a
zJ*q{U1D+{a1~}rQJC}|Ksu4Ee<F!_qunAVd6c*ZeHCE-e<=FhBe8c+FPTPafrFSYy
z3OU-{jbPl$@Ud6qdivG7=O7#$spxAKTa4ru8SV{bHks2)RdhO8quZPVRL_uKB?Tia
za&WD+y$RscO}j7Nu6E9-p}~NaJeCX2NXB!Dvm|Pgx|56^K9y21sKJ;Lpz~K{i_c}1
zKnEL09@Q7soUv*cmY>=}AWZblU4;V>cGPfN8qkUrRpii>&Zi>G1r&4@r38TE7_7uZ
zQPi5d0Ry0`S{X}ISsdb=Mpvy=S#Ubj{?MRuYh=tC9L=y2M*w8kEj*ju-}}s3x34cF
z`#ghi+*I_gcwwbLdJ;4JYnu}8)KSNYyOM#k9k`C#A^br`ezhAajw!NSGv^2Js_nd<
zwWaK=k4iDJu=x}aamlJ|LLZn?$R!6M_^70EMh{w+Xo|1FsE=B(SFJ$es>jT<Ct7e6
zgQYoYY)V9mIH?694l-$b)JUrmuQc<(rXiu;vq%OzR4m+5$n^=yrneP^CZjFQDa}TD
z*2xvhKXpA#L7IA+VPeA5#Y8FK@k+vDjYdT&ZYiYHIFcb0P)z{PkXQ(%3*7K2xyL4&
z%v-6ZwA@6EARI6M09uhrEs|<6Ql@i^R;k>yjcq-~;l*_pvJIlSdlbnP(^;kzU~^rx
zo3YbP7u4i&QUifaRUmUpYik>j$(H(6ck}|SFgWI`NZWq3gpp~L<-#vLYcAk!B%0D-
zTO^(;q_O8Gpsbp?q+N_jlLVgiojbbloO{;n&~@UdOKYA;{3||IYNU>SM3Av<z*T8u
zY<>17wx)n5Ey?t%uLB>Lt~~`DrE?hp92_s>nt6fSt~2UuL2a3jPpw#30*r0RrlL!z
zVZD)d$wKGy6+Bvb7%zao-R)ZeG1Wj6;U?3F)1OmRp<-P^D@dh6Jh*`azRo{0TTuDf
z`JGefNA#@afBER!kEL6NQ=G1G>~UJhLn%9#gGldzb0&QOAIh~XCcO*><SYK#;QnH=
z<C<K8B@5dNT2|6$hGI{t>-h@W4^t^@Y$UQV{{T{l)PtIxWgz_9RDYjSRAP-A7!99N
z4O&Q}L6%oz?l}Exp&2sbKH-HU=nj9CSd2F8<d4G@RU`ug?t}0C6=fI^@{k9wTG0|&
z4bM5vJ9ao3s=_R=VpyKTH8$`UAJ&+KcK~%Col?4+V=v0PTY^d8eQH^N*|-)S+mAJq
z75@OAJ6p?zXwM)LPBX;~l@w@2GRCMk<%rJ}WIrbZ4Owq+nJx(GdG$Z%G@|}8+XG<Z
z1N{0@F5Qh?!;Ih@@I@ivfL<|yS&e*~NMZau`ucw?(`VG<NmVzE#EdtmPQL!s$+I-_
z6q0GKz^Uf|=cQvdw+EO|x~T&M5srHC?N<`+In(59D|7UrG!89pTY#uydC#?41fKb<
zNbixHd2%Zp^vTcV+O(mNIL1`{C?S^H1P+x1fCJFxl37e=j&s(aYmmbneL>AJ5zOi`
zHn1bt{{XL4C6JN#BxOFb59v}|OOSBOk^R^I02<4Zbn4m2?~s3$Ln#FC+DyMPu7BPD
zkHWI9ba+@2kh>@SGmtp{0KQKn@T>Dn9l~LVf6$NXSyMc4xdmk&gti59O(R&gWZhcY
z+qohs&O44jPPIl!EyCn1tLRLdf2Cc#xO0#bgWNH!8SN#KLdCKP?_BLAvAk~gCz9z<
z5)s&Y$_+vNr4hjLi1g(Clt*l;dh_qiOt#Um;YNKiQ0`UYh9;y{zE!~W=B=#Wcm<F*
z<Z(=n!!7cuU#B9pEv#HN9T(V&T9tG~6}p$miyq@5wyiDLsLvmcYHLeL7a3fN)`lNc
zIir^Phd5fnB4CFER+M?~!KkB9oRP&?R1M12Bxc>k9S<3%GJkfJzF)k!q}(ao1oH^%
z`P6S99MjvL)f1^8jC~CV2&8bsKdn`gP0Ce|IN;NsMumuF;P({-_;uWeBhrG>GUbR8
z@g7vyKYe7;Ny~0u;bGpjFXm{8bR-aZ8iL;T3-ZYL^FN7CM)$5di;?ND({x`l);&z9
z?YHrtrC5m`E(yr=G+kS@teF`xlfm?<VUx^WzcBqpW}Wss<25MrErpUKXdHpG?anI1
zFtjS81gr85N6GwYu*mByJDrNJVtDj4uc|mPe&t9^x&)ntJS^SCXhac*O1T}pYymY%
zT<mhOmlH@2O3;UXI%2LY5e|0L!X+SJ0al><nYopcGTa(_TLu^{im(#~hF*GBi5c9$
z)Zthq3XFMNx8iCGNDR|7hwiTf>rT|9-FV|Y4nHcpVHp~HXBaQ@etkbm=1ZQd?2amb
zysqT>5VDw$12r2G-npyJgbqzWRH(=wjcnuC^puK{BV&rEDP?X}uE@lm4M4*=$<0eN
zMP1pbo2@iTDOH0S-Nwl9oCgJ`W4$3@yii*;nVNwn{0vdLq1}_kNUBJr;u47}H4c3#
zm^q|wl@gAG@{BywPZVdGTvkz9f<sL+6(Lhewz`rnq6Qq(ldVWsj+BbL(9sPs<24}8
zDHK!4#R(GP$c-f{Q)*m7!-X{QOS+Md6)6tGN$p5EpwK$d(40r8=B-7M{5h)1YRVH;
z7h;k%Y?uZa#dKEFDuIr*%vpk3x@#B)139kxE!f%yf^L%swMnxbX;G9e0H>VeHPOmz
za<rW)la0;KHF{2Jv0PR&j-^GF**$+MhD6V~t1P6NX5UZ3v*jz8q|j%ns`9`O3N!Co
z{{Utu@8JjgU4JgM42ueI734o%qxzb!W*p`Owu%ns<kM42l5dzpD}&G~=QX|qATA?@
zGwK&O{7!15(it1)c_#LDA%7}8np=lEC=DV)LbPiiqP)rv<<_M-O{_!r_mM^F0W5zl
zwIMHppzU7a2GROeS3#PqMGEbaPX?Bth~&JLy`xlL%$)v}T(+Et0?Oh+{?*7I$!fzW
z83XgEjAK96r{pv$+-d3ebco6S0H%{O)tI$d@HXkzqaKi{{$i;ZfuUo?3QYx(yKk#c
z{1HPR`>25b0O&QPZ+(3!0VF5iAOZOST;-}WN&p3z^ilp5(Az^KL<bmu?q}xwt5`|1
zIV5XDmyAaHoO<9@kqIAqW@@J#lE%0yKNCaWkTAmo>x$@28J1*)<mn>?_UTL!It+QA
zDD>izE46;iGt;$Br5}@2gfcnUj0IvpomdE13;?Io9Moj42@XPztPLt$CU${<c*QUo
zFfpNKf2(fBD~i-^{JiRsqh>RX8ztlS9kJJ*wcJjz$^e&<*b2whETEd%)DOHdo%j`<
zOQm#aO{?m8$NVMo>Junc*r<8!o;^Evt1GQG(%bft&PQtD{AHm{q}(*t*Do625w*6Q
z)g24OlFGu;{{Rf<g_sO(_HXg7RZ>q|o~1a!t622-JW40E%1ki8G;V_+k6+4&H95J<
z<&JXM;l84{S?~6IirZJ(tXgV}gzj&UbCPPKTU#>T9VK*jGhIRk=4x)A;xTJ<_U|_E
z!zelv+XLxdZGGZ<tw!_gx@wh(b)2qA`VPXo9|~wQ&2pDlo2G1KOLbqRTdA&xHCV+*
zQ_%DaMsnAaW6pEvK?Alcr6Zx;$o*=al8CLsg4_&xkxrG0?Jh<S@hx;iW6hSVv$$>f
zmybi*p`U7@DT2eMYBKIu1ExDs8h31C)OM;7og^~c0vbm87_<FFXWUy`T72?}3H1sT
ze=}Vx7}ZD5`f*TB=*hK$x*t(c<8h{Qe`>WneTqqa_o=G4T9u>#@6q`6bN>L=tD2pZ
z02YolUilnXH8suLE&Gd^P!C5x%cXP4wYj26l1btV5J*i5#Gm$?Kgd)++mOq_@b;K9
z-w*yYtEpZ$6Mdd>{{T&`(Z5_~t;H-ZM*S|t`>L{kCaAspeZ^YECAfk{{{WCKoVoN8
z5B-|0!XaWeX;y>KhLisQimeD@v6BI;UO#x|Kh#yo?JUpTl4L%TAS3ezj#uO^Vzdbc
zL47t1dWj?-)~}>?rx>tB_YPX6iq9YF<cY_iY<`BWMQG!nEr+%<R-@!AkVdigf;IYv
z{{R}aoiZYe@EQKKGQtmR8dirKfz2lo+-kl!F`)kdfYLK<ISPY6?9+n`;GTUADcR7R
zW}&81WL|pobYH_XjPVYre*;M>75aMAVY;g*Bh*$+wYj3O@7*x@b5TCn`P|t50CzMp
z;N*0m9E={7De5w`{!5N!QJ+w2J>mczvqnMx0BaRu_Acsq^sHIr*iLirO}lh7itItC
zS<NbvfgkAO{A-<>%JwNlk=wX&(62SMGi39~HH&to8*-eA!fi9AntK}dlN-xzlf=sS
z!z&DCqH8}Oh6L>=gXvi|((RR97=|9Ex)?6EW1Xx==Ugw`dTesZPYEt%@V~&W5wX2`
zcOI1<HqFT6KhCr~=~3{=j;FamKb2R#!I2qMC?gzqtCgDOOnA1cQRYU)q85!<9-_3N
ziH|3xW@&OO+W=fE1Kzdc+&JKmP;**Ebv1<fsis&&6(XSZrXgPDmx0l%WJ@O61xF!O
zQq8~vG}8F*RJA$8c5lxu@`|o&E1Ajirp|75GfdvD+kX;kplkbB)nh@zjicx*8%~T*
zew&9142Ss-=UP(PqjQien-6<AcQV39-Ew_MC)5`|D9u7|%!`7D@~IlokGMIitrY0T
zI3l%oJJp40QNis{825C3De^L&9S&-^8yJ8{tl6S5M2)`{W;t<CGLYO=n4!oS>s^x6
z@}z~{#M8?Vy(-jppssOGdu%Tvp+v^TD1K-5tE-H0P{#%&>BU<S;**ICi10H|u{B}G
z6&Ax%;h<Q6o@znbn<Qj&PfkFrlywBP9k)FNCc{<{IH&o1bgs1>4IEX9yJ@>{YRODf
zHiBvL+?ygb>rdO7aF3dlM^vem$Wvsfj|Q*2dF@r+6O7W8Fm%ef=AXqhsRtCJrDEV_
z;5%;>6Py}(YK4wkmnbL_BUQ)=HAx3GZXrU3r6?O#Wj6tlT@|q#KnA(JJ`K(}=DMpp
zZ<J=cscc$SE<&RowJQvY6+(;(c@@%@f@N1)gr-ga$*XPv``?hLBx8(p9)h97E>~nw
za?PLiSNv(4+oAx2{hq&{T6qNfRBpmC#YL<;jFEy3tJKmYVLoT&y*(+3o%@g8Jw2*M
z8TmyoF_1_Mt)9Z6Xn^fnA~30eP`r8hK;+Qk7c*WWLB&GQ5D7J|&|rI14FgC%Rmk<E
z;ukTV9`zrR-FsH~(@_9xnSyun&~9V?6rO6^{{Y9dOBw$A0MGjHKLJaVxcQ98beqYO
zf0QDN{(4Gwuj|&O`#zg&e|bAyIQ|&&6n|6sid%yu=X}o6A6IPC;nD6cFn0NA*K>~F
zPQTKfmql>(BmU7~JgZm)N_!_6`s0)MlUmwjZX;6I8~Q4659QXW<4x1f6^+FGK2RLz
z+xSQRJZZ0}T=~0Lt*-ty0%V>20mnaw$||FExSqQmD2aMG!Sn-~VYXKLi};h$vQtnH
zFo)$50rLiJ%s#l_A8u;eYA)Q6Saccx01DReBFvskR7K>HPi5ovsFfHlN~lhsH%e8%
ziH|2O-_oA*#@{f;@9#P3{(Y*Ix(H+3zTl^5QR!1hDgrPtPXir)3aYHE3n?r+zCS@o
zK|!2j>rUWm<y3Ra9>TIUJC%hAoaC=0DL5X3{Pm?5jHl(xIUKR<D+^JcS!dhHT;-3o
zWnNzHh0|K<b3Q7!ZBoU@mORpOr*BVT@7q6LO7myZ8pia)Alg9cJAEtGO~VPxDyeLF
zVe!BxuHVCoz}NJ#smAwE<sK1?H}MX5`qv~|XKW^-&LZ+*=HstwNhDXbc=uy>2kB8y
z8!f<64svU6MDYBV_X_d`STo7*_)!(9**b|tx#1rVu6))|a@(*+KZSb6lWMRgLXnOu
z2TzG$i)e@jPfoSF9nf_cZNv-^c*ytv02=CqQq=Nl)N*Fh$jU}a^y4G1(9_I{#H4_A
z$ie2an)mlx8Fw#noDuyqRNCQ*>db)hz;ZsKwz@_ZxzK&8D6@`n?Nx4XgJ3F;#2zaw
zz>T&*O9RG39DntVU4=efx!e!%p1=Kn!j53E5BfvNl-R)b546<ANzrj4$cO!^pJE`-
z8+(J>{3+gWh-Nn1>IvZg07_P4F?9$ekpnF63wCgRT>JV~6tcWhn6(R4k-xk~$Na|!
z@-^4Zd1ojwOo<zwIXEAVI(=|!D&pO3*h3-P5&N;+#!7#{Bu<r_<JAo_Dmxo1fIN$n
z6Mu5yarrRg`BeV^*>@Jo8RBi7nR#E#aaPk!gpJ7*!F^$MnSY`D>cz4xx&6(|$bB|r
z^v!3=x|MRYF>zO7OmIb8MvhVQh@ATEDAgc>KFsKQ2N@rjt5FrmQ9<B}wHVq)(~qSB
z+>xAT9oL$2f({SoPSz1^U9o~s<5GjWt`Ff=V>mS^$mAT;xC-&B4D27BR*_TgkmKBQ
zR>S9~2hyW(G03TuQyOJFi1GWuk46TV@fZ1{+JD_OG>VwUYLuu`l{l;<t-3W#A9{V|
zBBC;(1UK`hJkzKlWjO3}R3^WjPtP1|J=gk*&A6>W$*s#Xk>!w1M`Kubm;~pLYU5w7
zK6_VCd!zpVjZ~E_E(Qg|p*^HdKQUQ4)ZUjW6`;2<CN8XYoQlt$LO}#qK`xgmQqf5i
zfO<YZol}=joq-Q*{{T=kS+w4#bSg#c&Us}5UU;p0X}73c^Vs@T8>=%5eDS_LN}7GI
z%0}Afnz7LNXB#te+VCU!Q}?S^*5KJQ$Ije1`gf{pYPWEJcyaez(xbN2&85K8uZ9)Q
z(8soW{VE|FY0&cJ2SK<$f&K=Diy)DnGJ9lJ^h(Rn8s#*pzRjqrIq1a?@~+4*1LYN^
zHF4o*`@UYo(42EpF4NwkErCq?Kta;CH?_+_MPcjmfYHP|DPxa*D?{yy+z@LIQn-#;
zoaZ=feE<~|TI6;)u`Q?;G@D}3%`dA2U%+$vnzYb5pS{g3jv%p#7~!$F{ZBOcBva7S
zrxc)(wI$00i1R9rlq`hjate-Hhe7g;imunW2O|cl=xnaZriaZ0g#xHOqh#dqPHWd6
zbmpLv@vw4EO+&G2=6XU4XBB!Ya5=?A8lOtP82M|i$CW8+J_P{a)U!l3^r%Q+K^0+}
zF@k#2anO~iSs@rS;TgwDa<B)Ql}N#(mf+%0nlcSFBP8=zWC4vc8(OKr5-ZCG9V)+<
zwzXYDEmDziYnjH(IY+_B#Z9v#r9%<mRh4Yk$>=8{tFI=Ssot!zb4U;^X$XoV5i?Q*
zspCM^DT51|)YUSPk0zs$Fe>p(4n<a<g*1kp%&9=!YO_cIsb@TzsCnd8PCAOolI{ka
ziMZ8e3|g<j82}nwg>s?k4;<BqA<JT)8s@KC#sM62O;Zb4)YC22Fc^%JTai91D%<5y
zJNj0X1ChIv=sSIDc=kltynvp6T5~qh$LCDlwQ_2-vX(d%t?WgvIouOb^6gPb;YKP*
zPs^HHj>1Xuj8#Vknyo)I3F4wah}Ch=dVGAgYBF=c>U}DBlq&rEdk#$`40ARq(Zzwe
z&I#?_qw@(kQG$3F>T1r_iGO*^e{g?Vv>~N850yYs)!(3~-S+VjCBNsRb^HfvSCF{N
zV`%6HN^-zS8?)2yj=%kH#(@-!Wt$2MG5krVeUC4-*zW$~4bAxWAB|a(SmPUaaor<5
zey#o$PI(!VV-XBpe6$>&!@tzhX>}U7tu?)&{n`hYpa^#y`?vH__|+?^<d-7qQmfa<
zi~9B-j!j1`!3ShOpy|gfe;?>69yn}(H%sPE)LVm$f4)agpy%mO<=CqU8c!^2VA*MI
zygbB__-^V?u&S+aux8&&j!!Z8X~_NK`zZbq^yl8Jzngfgad{_~ZoEdSxX-9X>G<^e
z(xBgN&Zq1)JsH=TCH4oYKadp~y}z$d^DES4g{s?@6Ed`lH#BN;5PG|I`gN^LpowLb
zzWU`<3}t<Nm-*Ivl($Hb+u&pydlAWYJsw|QVc*zP$W5{}#moJo?nlR{-{?JY`3{_(
zmmM?{)V1Z6Pc^<w{lx>5{sN=&<X{2G&m>ebU9g-+b0qoYdf|_+y%S$5DL~uL@i6Fp
zhqYDEtXlh7KonyGxE(%~AKTVe!NMTwPp7q1lI0?ZF#x*{k$t^SxAUy|WK!5>C!TRb
zoq<HyH*xJUXXVcs{wkw9s^g!gJCjspxN^Z40FHf>RXHwLFH`<C5}IWwmmkVGEJgt!
z0s7S|oJkJEJ3;A>L;nEQs`K24*ACbKVB__wH(Hv-FP5is^2hoLl-EM6%$xfNUsp%K
z`T<>30yq?7g#eOArEt%vIleOIAUIm-Z8a#^FC*^Y=lRpUnR2^I=SK^Llc+p!YDBro
z+Sm<_IjkkP4yvf#K2y)-`qYN)m<GrD-pAITv@&ov;kwx5Fv#zY{{V$bE)+K8D<Ynl
zKQQ`w)(Dx{dgre+h#bkC$DTnQ>XkxyO{cwUd5J@u^#FDHj{gA8kl)E6%x8jlBiGQ1
zq=?QybDVTFY86G!;s`xy^Ea@pTZ>V+!C6-$x#&mZO`hf2CV0qX95)?4qM&=xYE&>1
z#heTr5%fKew@RDr=_)PVoXVu}mKa9=0A+{ppV0eLO^Br(!yl4W^J4zvbzh;aTS=vY
z0<e*DY5Y*iGFSXQqN<A{F=m1`j#U2uS#El-y84PC-yDIJim>$Hikq|>nu%#?bNkyz
zFu!=;oP7^p=T2cfKnce1dKVjk`ZxaoUZRdjr6bH9TlSF$ABg_|3c9jQ5c%-v-^REh
zzP*p*&04b&3IuLID1E+R{xxbfZM<QJw<3WAQUcNZg!f#P{6|sxih9P;kXUosf#Ci$
z>;#Au_9}frsd}6^4e#EJ=aK>U?@s_Qz~}kX5gx_Q4TJAPD=$4g4{Ccyk;hR{g2~gN
zpkjG|@J0;=Onu{#>rlxc-G@wiGXDUDFtA~TaL3T_e=3j?;jvWA!#hqX(#x>yV}s~x
zI_g#AWCPop&PK6Bva*Kc;<K)g3<n_h6%@BT6?1}m8qb#Lmw@|Eq3Kvk#?gsZO-fsN
z3iTd_q>Ea!Wg>XOvv3ckWna&|fo0m&o@LxNkSmg<C^vIYtHNClv@CO>@+AdNp{QCZ
zqZv76^;2BUyv+mQT&-!tsLvnXZNjm1?eOShJRFv=WyWdFe7o7=K83dr)}Rt<$o~NA
z(slPm&+?~@O4-NH^1gbSUotl!qbc;-YF$dP-Q0y%y&Wd6^8HJe+JyfAc0~aH0JsH7
zsM*1*+Ie$u<L*d0p{l-YyK%ceQAnDD#MpG;RQqSt*Xxwzttgo>tkx2&7bo|t-}}SV
zS9Pb}toCtB+>U;txx3+Os@wC3=TY)W>Hh%Mr)avnLpRuO6d_IqKcBr+WgFcy8gqpy
zHl5S_4zqsl)Yit{l?j*2#!f-#M|x{YPaxn{_eXMRtDzclpYqRv-mp_M+{YhX?~lW&
zt$T)H1F?S9ou#9BvP+EZ8$tBXTE<T89QBtySi|OSn1ggQ#zFz8)?5%WYF{}>t&>Mp
zh#oAid8(4ymd##iS3i4-X#n!CK+%lwv}|w<LeaE?fNN=#<D43gXvf{DaV<!#8jdQ;
zfxs028(S4R9&6CC<J#=7A{%p6<~um6VSr3lb=;WmOw}YPZuMpnxK%iQam8MZ3XIhx
zO9o6FQy6BP%t@;9GhpJ7>^)x`R8rJI*f%{X>`4{RQEE(;gkB9<i<+iMQJS#?!xZCU
zELqw!Lm(#<EUQS)H)K~tU6D#9s3QPXqPa?^6u7HNAS+jW1knVG!K~|%^6^*akaJnn
zwP7b<MlH_ZO-BrDdezyZ1m==89fd-b*;la|Ktawb%AKQ;QYvIqT~0!POCEp>lUg}g
zZbgWQHGbcw?1fRmW7GctuUSaZ#Bk?(kKj|l`k&}OI<}I$gfkrAa0dVnKs!}NUCK`8
z%ri3`v4R)79;f^(UIz-G4ozWMu{?CGc!&T2S8-Uh(lb+A5w7Mmo3<`VtgS-$E=Fm&
zB(BDy-iVTGwevdFyLn5D3g+NuxtwD-&2-k0VFuAt2{|#;ii#y}l?$jGxxn=ARvQb`
zlj~I^%f1Fn6Wgh)mWD`ZW_H?xYW}D3t4k3nfq<i{9)gh}TXoF8Gyec^9-mW8camod
z)b>>!K7xg&;K%u@hY*507d=nY(vU04`w(aQ$FTaEdczzm=1$*4Q_1{lyw?pN+5koD
zi_m|Gqix0LS~Ecd1&{Z#58~*f=}|P#BDPp64!wu}0A8XBv5dc8?*9OdK{Sl`T>f1J
z4ptq92}Vgy-^6<U9+g$M$sj~lc^u#Y$tI^%XT))?=RF7YsO6Du<7lnhbFjpd4*tvN
zKb=Dv6UeqXSQtIm@k%)n`!B!0<xpE%tcB;FCM4=J%bvsWrHT^Fs-rB&fm&uHfntoO
zn&0xu5&Ud=vwM1b4@yaMdJX7QFd3O9x&^o3MFMbKb^vx?=laubCNWO93qAJ<8X(F;
z=oEA<`VV?8FCWinpl>?rPnG3g{RKS-ss2K#s4pkkZCy;P2$thNDlc#kW%X|T2OV2>
z{=cud5iCj(=5sM(KjZ4@PDv;JS;l*l(?8I272Vmja&W717id-cw#nHC`(CH|)b}&T
zVGKzMkjg)Bj5%EOm529_Vc1q|5E6XWUop2~5dQ!h{{VpR_)=}HKd<Zh>^0ncB*PQC
z{G_g5)l~Ff_nY(|T8qxYeWFk>Psp8*{{UEj^}T5>SUkQ@m`qWU^2z=7{wDr{smDLs
zZnsE2MH{yE3Of(w_||RPLXs@YEAPvaT#e38sj8-9yr1rhagfQEWAia2{c1BG?xgp?
z{eSw^E$UK4Qe3x_D*|~qC)=<1R%2f|bqB6%LRiAvk?+Z>MgYx~91cZGk)lY+y}6c2
znRj(K{LeqqsXUT>qA5!sm#Fo?z%{oup7v(iJ&jVD*A3GY`HHeKUM=E8>6XZ9d~V2h
zDLZfmK9zi1mWyuRn2hF^>{-Ca7|kwVu^qM4u|DF&@%h&3-DC#OC#_^9xl^@);<TfS
z<-qJHQX)vcvX$x3boy0+D8~HYcly;V70<A!jK^yz=W>#`>ygj$%~~^Ny502v^Nf#r
zciIKaYysp9@%-xrZWt=CC*>i59jfv(M$2&#$2^bjpHHEyQX!)3R}J@E2UEkNe-P|P
zzrAQ$yV)?77B6o65&XgT{&kx!(9z0^`>8sAu-(-AdR5ywwljRYk+^uzfAi~9o}{y&
zj3mn&#Q5zS4oV;H`;YcdBer@~pDk{9nF7s>?p|^|bNoZ~6_*$`<IISOA(M<ZL+pM3
z0R3XMZRSa1$t9CHB%hfNx&Hv!zdG4FkU=C35g8ncu?sQ`smG}54O=K0J+geu*}%d7
z06(FqS_X#%9E|nsX*k<d1j?%XugLwe{VHhWAnkBD>*#)>qeT3v&mzW*(hwC!OCG|h
zF%|-;&JI0^;+j;W4CGT|aOxdT1Y{B1denkUt_}zts<dD@HzPegX&HNpP6+uA;A$tF
zCk>EA0~tJO=zAK?lpa-ptIsOGD*d`vRrG*IyCWTltmITtPJ8*ICN|pYIxK^5{{RtH
zKGkJ?`=&BlPC6X#L;Y$SjY2GeCJuNfy=Tg*?Y=TP<E?YfZ0U>(a_P5DrJ^ZtpWcM-
z$Klqo?st61<B`~oYJKt}?^k4ye-fM;mQ7vWJYK;a(vCMc!;kn7=CaaWMyl(~=OmR{
zLXsb$s3eEy-lnwfEwr22aRuXAOnOIjK3w}a{<Q?!UG<7@mKHG|`DmXqAH-I3i$<}M
zWgxu#!||&rfSqX&gZGC@n-P)Hu{OC%L?yDP@Tp*l@-e}u?86eLBCK2P4UjS`ITxji
zw&aW~pPg8He2j+3rHrJAGQ88g<UzHma-)`|8gtzhS{xpqh^X#ueE3H=b^ES5ioEK<
zH$~LeX}Xpvj`7c?Dt(ep=Pa-{)(pR+>cv~^SKsgx>+UOJWif>)+mMWdT(#0Q@~cKx
zITMgb{#AcX)THvv7YntA1x;xyCApeaFzepf(zuZQowH8U{!1~>EOKg{)9zJO9ZgFR
zlmG^BIIPpu-iPlhxvs{LAf9SB7#SHf*umhMm}j0ityZU7(4iyX^dgZaVmLLR@Z@~z
z6neK62l_N;fLQzRC~$j?NedIm+yyO8cSw$0?mmnDG#NK81_JsH1xCjM1PW=`R=E_a
z!Lv?{s?^bfII60IHF{3RlG;p|HbJVFF&8y<G|8!jzV2~UkzB<KWGE`@Nrm83#jZIO
z9MKWps=;y$oYN!(jwoY;)|{#c&S-HOWmQsJ6>vlfaw(}8-JI2snL&!mak%tmeCdJ3
zSB=O#R*cc*Y6xKj^sKpT#bPyQYItV{nz*+9!jCKi$gPsl5*Uw#sgitgOk`fw6DbEH
zr7^TgE9JV?8<sh#+c>JSw_{xOJ>6L?q%}tM=ARs3RA{Wtgc+%U5#w*p-lw4Y4)ox!
zsA$Q2ye{Toh<l#@072fW+)e_Oh!s=Szlpx5`U;Xl&Py{APjgT-d%9MwX`ooFgy)LU
zwPVI}QAchECafz3#sx}?MH;rdi5%coyfP}UT-QGYjB&}uX$b<x2{od2MDEG1?)IVP
zx%u7hB*4#FyELA2U{#wN09+Os_WuAH##fs<BGgeXk*AWNo)@|Gu7hY*3I+$I7V&LR
z0O!yOk@hFv`H%NUPsX~T9#nJAT+D!>U|S(qyC3t-MYt*35G$Upe)0bR3X(=?4?lqP
zrWFzJS9FJ>e~<W6WQ;4E5_YF!)Ssa5PiYnfASeufhp+wgd;L08GE7m98+50u`g;nh
zNhboO%E4jtOwQm8v$3U*hWU9Tt9pGu#;b<pfO_?BU;edB9b%5A_bS7j(;|(hauWa!
zS&sonsi7k?DUsjHgd7*gUtl|$Cc0lJMG)B#<SuiQ>Q8e=n-`$zrBH?#G9y2I&sYBd
z0qs>;K*j+lpaZ$3S1rPf=Yfv(D@-iTBKhu*%eZs1*?mvFE8KRl^(&_?Hi>WF<q?6A
z9(x~C?Nuk5K<#e}8E$0HohUz=gX!vjr8Jg}+6YK%wg=6VJVZN;`ud;1RaxC)fuVyt
zTZepL$^HxN`OxKM{eM^dfg-fB?ai&BGD#=PB&s=l{{RaA0JHr5m87X2muM5^d06w@
zI}YFODgOYzo7ht<EmF?k?X!cq#v>WQ!Swe&y@x%;RGLScW|8rnZ2)i-dJkb!vb!7C
zLKfX|cb0NV%R9H~O-}O&t^Ck7mr}5=bx!T|JpQ#_La0n`NI<@1<aOKGex&2`9+g<!
zJ;Sul0Q6DNW1t?wmF?<Ph%RK=8z|tDgZ2La>wj9W9D*xXex^p)K7*6}DK7W}C$>-H
zP+XFt7>Dr23HrAu`HITrq)+GYxg&v-_|t{RSXqh2SSb9BP@Y003O$GEPeLSxx8y1i
z#Qt=yGTNUu5xb9X#*jyhckfje;hoPI2a2^6z^)1U(irl9;Yh|tK9x>b0SbLjO1NJK
zIpowkx149AP&8ML05(2)91%`1g~obzse%o>9&?&=sPgDSY@pHYa%wfY2`E#+Clt3c
z43MqPDulU_44oZ#`ubHNGgRFyp@8e05!_T~_iY@bqPF5coo6dH^#`cyT5+j(fIfu&
zw9_SoX|m=>NcnjJuG=@3N=g2$xNmQIh(<j0az65@Kg;@6u@_&?t!_glfQY7RdANzV
zW7lt^52x}qWg>W8)rZWyb*RyI;Nz7TKH&7N2$V?}Q~t3BAMG#rkL6k=av1wNri$mu
zDjGJ7Gb!)RD_$7P5(y(YL!GDk8pMrH+rS;5&;C7Ic8|3PZdCOY;`%a8OL8{lN6Xz$
zZ~nDe357d(Q`8!x8yH`%RP;4$N;d9Z-9>Dla#{caWas7Yp4EXp%j<fGx3?r&!gB6E
zkNo!h>c*viB=dcbPI#K>2x*~i>U}f89;2=*-MkRl+n*r<sXLuVWBwkMokZpCt-6bL
zOeGPJG4lFT%*&A3>&AbjXWdDJGA}%4vredE&mf+KTcP$fT*+u?qS?7D+fG5+PwP-U
z>`r!1>BA19vYPYl7VRyoGX?wcH!PpxJ?kO$WRr1aZOPXiYFzA6r*om6IienHG6^Ca
z<$ABwRz>~B$c*XN_|bYdzCWc$ZK%kYlHOd7xXnuU7I*4*e&%EMiVTD9RDUt~8qKu!
zG>lq$ob;}fatKm-2IxoEx8qHP_Ayhq9_ndz3mNRnIr-sp@=AwtpT?qD0a20&9>3>`
z<dyHK)~#&|22<asOw|dl0x?23Jq=o53gA{f{^bOWj2ff}uX+`*>Um#UWv%VylL;*D
zWQ3lfTZ8o-4I;_rsa8F@)h1?wU9I;?;MBL(7MDhJF!+Kw+ZXzD5}t-ge{}x<rw8(@
zdvux3=x<)#i2Jt@@&5pQEpsICymR!aqhTUsAHt$j{$+DTPqR0lMZ9J6Y|K|9sSwH=
zKZjbX7UI@V-oY$H_H-ht+o=)^WlyN$wc@zcQ~?$A=sSGmkI2+}kH{Y~9Tn&+(*~q#
z#Q+Hq5$#9YFTnlix1avHK)L+JDl@5D^p{V!2itJ|wE0&?PQ8iZl_ZnoVh3!~+9+np
zW%-ACsxK|~Wh~?TBx=*JzPFqR<h6}GIVO{9t0G*{_pPCr<>7DwaaNbhnL%^~$A0y%
zGysp1IDqVSXZh7eRtE)$`f*uJC9^T9QZ*xDrt*>n!vF?BZYxe7nh_wCI()z!pVG2#
z?0(Y;8N&~{$F*kZTBNo&Ceq{;aC&w>N~pBa%kR<mmd9Nc-LYV#GoOE!ucW+XHmLR$
zk|U1b2_eRNf!?1M=+6ftnv+jsz7vH=-5N}>$lo!;d(~-dh~!|3P2`Z+%}TTG`Fdy8
zuIgKrs@SR*X~Co_ry$jdQ9;2K7QhB^QsZKiC4q#|gG7-5&MP;=_vvt(3FKDQzyT}O
zRPL^M3LVO*!*Bwx$AYzvB?$Pdu-x&AOG52p@dFi3bHNoapY2trW+x)8G$zW7gttn5
z&;>je^rUUoPQ=}fP!^OsX9Ad8Ja?%fQd>Bzw`RApB@wfVdO{Ruy<3R}-juRN3e#~X
zn5q~Krl68k+;C`>2q&7)bZz;`snHdwb9G3hHBb@}#Y-Icp|m>^%r=V5k%4}4eT7-N
zy2{{HTXgd!1lOHjZc|Z5HAi=3TUgo93}AW+O^m0dXhj%Q6v>wdfnD^~k)&f}Qpc@1
zW6dr(=~9njRLe+bmw{2^X{nCttuqy>T8X<BW0Vi2Ip=v!)jU>m%3ZdKzikYX<gNlQ
z-*z}v`kERQ%eJY2IOo*%u7b`VEdx2&r@88Wg0P{xg4~6-9&3)WxZr=fztq*pZk}c&
zxn}N3q>_nB?B5fNVK^SW%|uwc1zu0JQdv(3qbz$=i69>;9Ov&hudw#2=D4{_pKyG<
zx$ZlCKgO#{;Q>|Ze)E4ydrBAOA9(fls_t<^VzCj)Y*cT+z&v+8wJt^~76ph331h$m
zl1*mHAl>uhwM@u~iEfDtjHE!}pWONnf69^FEJ-9Rw_crAKk+}HsuHuv#pGhn+_5K<
zOKrn)OPlwd$3Uy{qNu=8>?$U9cUY4wNx%SPlj%@n=g(nQ$u<lkjZ0ri!Q07*AK@SE
zAJe5x+_hzFTSogJFh+OXH_FVs?9cG8zp1EYb8Y*W8BNmzzCd`0cftN`_zFltfEZ#S
z?q7#>JShI^pXO^O{^VraKyrkSF~{+rPxnV}&Z)jt{eM^dgq6(;Mn;K}AN514k%J-i
z_8x=r>sBqTqLJR><l@#2Tm7TiN7k!Y#>N=oOmBYYpdHEf9co*dmU$wzha%b&^5=rb
zyB~8xdR>FP?1Z|wdzNYB7#$UVx+=0D3%&m8jmO{Gq?LopRetL6MZ+q_sDAD@ztDF7
z06w&vySoc$W!)100J|sVW9CYI`wze$)|nh<810PI=^H$I-<Ob*YH=E^WxFWJxPZ$#
z#y^OKs4h?!cmDa~>Dsk0Jl1~72_KbI5VX(FK}n4n%>ii(<Ei4GBs(JV$EHuYt#7k|
z&NvlLRLr1z(Q6L(Gkl|fGm2K=f;x4puNDpm(xP~<4J!`iBP%yK?@%Sf36DH~TDj$6
zdT~_@ILV-7zi4n1vBqg7yJw8x^c>@{rj#Xz(yKP*ke<hiVj-GM#CA0|MnZZH4Kd@+
z2pwuv1&Zg6X^2A!S@G#wa4+2d0CyD-j51{X<p3Y2Z}6*;$AUe8swI;uaEGokKU%O8
z<_3{h<Q|2O^r_Sg1MgI3#fWWidV#_B6=j1&%E);fgHbAsb5lxpLBZq>D@ce~Wgpt?
z<NZ;$7#~*Teif|~sP9~@t>Z^^aIP`}ZeOVCKQ4a?)Pn3tnM26!LTg`8Q8&_c!B#wX
z{{R}RsOpDGw(|xaMmUX6M*jfoRamaG*QPtxQ(cWx=)&qmEO8Sac6y(o?@Dz!SD!*B
zriRvyZ4>X+Audd$kRR_M@6UXmhtjsvZRC|0an*ml^#1?~$cE?aWg`G8fwOOJJ5<xm
zs1|6+^0#1-)-&%_Pu*XcN)}coxrm6GTXXWBN_v0wtCqeVWW|(VHv|#T*H8A6Ri})C
z>Hf7xMjzf4%xd>7J@zcow&QUFo;`hswM|=C)|{;DX0_GDw7z6+%OK=)gHLk~v=xCC
zx`-bujuKPoeGlkrIbCNsVS;^oA4<;+hUeD@+PNJ@wCHxIC$CZBdz)K_j#OypPO5v=
zw4Eja<M|t@Uw>+qW`_PGhWW6{F%`}^UqIfY`Bh1+X0n*9s4{s39IFpdXl?Gc4K{n4
zkX}fkmvB~Zyc}*xJ(zSqTCF{t7j8wiEpGZ7c^l+^_Za^GBl()IygG5g6y;@_KPq0_
zD<0n8O0_GiG8}EHDJ+sjBSA7UNQVrLFjk?wD(SU{2fw9Qp7cj5+iGa?#-j-`{{XFs
zdmrLoWB6jBy0>(VU0r5@_>ST}RlSdJeUCIXZBWy`=Ra{9+eyL79%>huIQh9#S0{Wi
zNd_xAXz&R`$E8CyiY!FoSFS}^NXQ}MJa?>^q$tDRt!P0pLJ0gtVI!g_*|cC-1{J?+
z3a}N4WD;BvTaY6r2&~rRR4tov-E&puu!=+XdHPkz<R_9U!jO2tHKB#<U`wVL&&$%F
z0pNP*rHi`NIhHbT>4Q?@i&O44$512Wen-D*xZ7mpR#=5YmR74NBd<!yr?JO~qZvEL
zMq5-?7;(p~Qq=S)ucG0C%D-|!)PMD=<TnWTSK+;C$yQ&PO*p$4My#p(l@-y+>DSj9
zb=kDL1zk_zN2so@FqccZG6VM>wNC!V4Nl+6kZhHCj#1T3WLR9>X*Y(}@L}_i-JE)Z
zT3+htoBKD<P3Ux=GMpX=ps?@Or-h=uj5s5HKRQ;3fr6ZlwMks=r&=>tL@PKa2Bc6h
zN|E%Yv;zgUV-&&)jIME2siudTXxiDa)Iz&_q~g6AJM_4PST=Gi#xFrhOAdh6&W++X
zg#=Q$!02n!r&CKDT&JUzQzmNMN&@D&IJK2^1WBL9t;ei!_o^i;u{fP8N~_kH_Jk*s
zSifnKRNi=B=T&CAW#K{1E@LFMI%SqJDme~%S3hrgH*jkDU6a=$t`6pkD0}&WO!ljm
z((}(1Rc2Q1D#9ouiq$|;Pob)lAYQb|Za`CmS&q)0wNhKA8Km8a<uA!`o+<_e^HdV(
zm#b+u;aJGhC3zU5nv2Xku~J+H&T-bNO(&8vY9GQY?ulh!IVO?`f;i@@2Qr-YqiX^y
zl8q<N%*-K8xm}$E7h#85oJz0Ox$yEXHr2Rpj^GHc_?yXSY6t^29qHadQ|(T-V9E_k
zA;R^oc~KW-ciN(}ZjS7LII9;DVDfM(DC9{F2+bC;A!?RzMjvdKGm)KyY>f}O>feS1
zEyR=DW6pn(e(Y*Cs{H_~D-b;fDbEnaODP$V$sv3W6jqg`;gV5r1{sGW_8z33!n3A=
zX{GX^A2H`SzyrA_uoW$;BnV)DorLB|2*H&907V^#;%aO=of^!hEvVap`_KLF@D(gm
z2++uVaHG`LI!?^%1bz6cZ7LT4ACWj;e|k^3N%S<LlK{IOO)=&K^{nZxil_pc1IcqJ
zjX5~UB;Y9SN$e_lx)saR)rneS6$V(+&oF^XK-qFI(*FQO^d9}`pJ|OC@}O*?`_It7
z_z!BV(g_Fv({A3R+T=TMGfX`3PnE`YgTSX+qR8zeoI@Dg=hx7EPx#h$zJ&^->`;q>
zGc*4Hs9a|qw`2Kx)n{+8hEUnNek6S7U_DfO`k%wrrn|XloXpr4E<Qyc^2Kq-(Br3|
z_cfad@|W!`_p^wB_3qz?^ZHdcZ4i>arKzR+G=FDL^{>R3{t^A(@%*ZQi!x;N6zGm$
zapxzkXj;OF9G8;-VTYoh`Do{&{X3ujy+iky_9QR%TX7%!wv|DD_7k%GPgD3+WS3^<
z9eY)FndOn4iDqM#C%L3z#ACH5AV`|szG7JQIW)49BxLuIvG@=Ezw)GtJVwXx6>DHk
zqx*~c(4K%pDOeD9W6)G%I1JrSYJ7-gpK<3L3Y0=eBvQB)1Vw|KaezNsks%1RKEfkD
z<bPTiZCYsoQf`<PTgvLhRnZx9npq>l^rLVy3T26?WnAET)#r?=liZrDS9H0g1Pu-&
zHBI8j&{o68i0Xu?oG+=O!%V92;Dg$!%7Y*eTGn*=YMiNtUt><fxmX}B1xhXUu;YqU
z7@YnBtgX}>ZdsHM;xRTqJX5g6Sn$UqhF)saE0zjJY*O2`y~hzsu2-lBkSf~1JitKW
zor$>%0*Jp;{VGWFw1eE#(?$ps#auVg(qt2-DhI7Z(x@d#zz2i%qDI*0z5f8inxm*V
zSfq43vWCad=B+ZJ8?wFNk{f`*hVQ%&uealhjdcMAs9@-r00ZgNe>$xl>AQ#)z+^c3
zj%v12OK6?j_p7-#W3D+Lp{!K(b~dLCWSO$pb2B_jPRAI}r!^!6W6LW5K*H4s7{_d8
zla|L{TFGmvkg<IH-G3U*PnvgoBPc`KP0OM>s5N1DuWcrL^ce&GdLZ=oAIs@euCmaC
zAw+=uj)$X;<NDVQE+uA3B4U>)`CNBl-ns)JC~g>RE&(b1(~dx`-CJwas+OjU(&~=|
z*n4+%K-r00^N)Jfg61O;xwZ!f=9(eZPI`}Q{{TNjT)a*mLh>f~*kPBxJ-rQCQ7)Y_
zG;-!9`I205>5@M}Ju~f58k*hz0K*dqTUl;t-`K}6`)#2GPt7E2+tr7+sO?xXj7(T{
z?kiS1#<ZB+!Wm#MiDMj?9RC0?dwPy3x}}=QAKLAJ0AZaJ<Su#<>ZkPSMB#l=(+S%~
zP>GbQDEz9`z2c;QWw}EovE%{wqdu$t6_{pBkKsMOl<zlp%J|6cD>(b6k6p|6R-yp1
z#~^qg?n=E6vGhLGSg{<`mUmKHNb!&)g~!bsbZ`E>SeDv0h}_ze@P87Yyob>L0F7Ho
z>WNO;GdAH??mr>^{{UFq)7rBv?rwDHoFP)J#E{4`k`HnHdWzGYEDgZbS)&(7W$7aD
z`0wvZ?6i>Ut6P0W-R86mHVnxVa`Air0JOgU0G)GF#`fnp%AWPP3p^Jp;o3J-m61*e
zHDd18EkJ(kM-AJ5%RZ6Jf7$G<?a=neN-U+e<4wnNmH{LOvsCICRAaSAEwe)_0;L!n
zGLeD}IUMH|gwfR(bZzO<V5=#vrqV$m12{DS<{KI0#o4~cuAa(pR1EW3Ey<{Ed0Ac$
zHJ?1@2jz%0ZV9n~d8c06$8e(~9`&j@x|HG7j&RN~p7mkxV3ZO-IIdGtw79pB`DB%<
zme(>XVc3E@nym`uI_79(A1T^tqbff2P_nvN6bC%|invoehRCd>mCkv+Nl}Sx92&Q8
za`F79k?&c8Go8xZ`c=4#0hT<9o>Oi}$~KXrr6lD@2hi0^>sart%rh@eRlA|}6(lo8
zK~@LrPmo}6cQ5KGG}fr|>fxnL-WwQpH`f|n(6_li?!RUg*FkW%k%FuP{{T9|xw1>h
zN=P>;=togm_V+g$J<7#!#iBoQPR5AN#t@TE=<4_FB^Z$1!Jv*#R+|*C>JctmY=CgW
znTeosk{pWNO6Q|WqMGwOuTj&$fI5z%xl9z8fO;DA3*EnEz~|-SxOR^h1oq8#LF!=_
zdzyA$V8$5NNv6Wm&Y_9gIW>!?sK_F^T_|k@o2Oc-SSI;Z&3(PIg%s#*QbsuyS1%wq
zJOP@0ozY2f=e;DfxS_G99fB`WQ#GV&xxl9_!W7`uE8E?Xk_|LALv)K6EgN9gYU`3)
zOSQ&8tlPatc_b>J;;MOa%5r<vskks_ZD*;$dY}dX`c(5;2U10G_O|lI>e1vH>FsYM
zjxCdpeQRjSGeWiYDwu>)b5<jj7oICKRKJZ$!*f(^^$qR@>cVX&V!3Q|PcpibSl2Qc
zWhyg)TNX%@3^D6c-02F6TzzUR<*_X$W?RMb4At1})s1JHpCVv{a1Cl(OtOrMTWHGB
zXkCKI2<cC25T_std@{0u#aZ61)YZk3+9E{QF;QGewQ5#;bUCWe8_L}=S9U8S8uCdQ
zvNMXB!<PA~`&bt0I`^l=r?h94!1bkeJqgv6Eki3i{E>{ibm#Rx)opE9;3y=X0IHgk
z{hAhr5td<xVc*z$e>#~;%It|*6<+!{;xNVY=kLBTy#D})p!{jjj3Hyo8p0oFlH@MK
zjPqO3tbz~#!Nn-H^<>3gQSmpC9FqS45&2j8{{TvTxM?3G@j<m_DH%K<=jwSs&a`c9
zhbjmF46`1JJxxAq=!X)kTfFX7FaR5!*!-wHO;)(PRgBx)lD6RG*o-u%)cOxW+v`@Z
zuNC9_J-90XIVAOv`k&}KRz!=tipp~3*xfGVr&1vR6*O1JZtmxRD=I|@2{mHYHo#&q
z2TGMTXyUDNNJ`8A<gqvabtbeYxQ@aFjO}8C;B^v7(3Aek^!Fp{RICETlj<`PBDXsu
z!R4eQx40jcLG4+Eq`RG_kdAl?Iw&0g_7$ppvA<Si)!B77T}WBn{KT`l=L5N@u4MBZ
zU~Nz^tNqc~RE5M=qBQ%=es7@YKd<FblsiUqxcXEuA88X$I>f9L0lD3}B95f{3b}tj
zn(<r23c(o4sq4>k@7(=pD7J_!@s~e6$sXK<{5$)fYK3>R7jL}BHCrnScX8_&104r*
zO~GUGjy-C0h>@2)YGEHhI#k$WVThfodz_k>p?OFjjXp3Gk6%ijCFF5V#N1?d;Xv+b
z?GUR$Fe$7CD!ECyNzfv5>zbz~UBK)r+~l97S6I$@?^4P`qmPc9RC0`*Rjgqu76T-j
zVkt}jUwVe#4E)BdI++-HW~WHPrU0Z-PGrFLthq)VkEcr7n~()%yhM@@QfOd^f^Nk_
zG+9!8dRC8;RwCWU0+}>ncARwLlH3eHq&9sHYi7lPw@^C@&yq3bx(<pl^`}`*BZv&~
zlDztgWm`i0altWfGB9#*NZ{k}sGm=3W+uvKLDykyij8hIXKyD2fGMeam{G~~&$nt#
zs}VYPMC*Gn`7R?Hzq~l9qnRX?uyc?NRC&ORgUX(s)m+Fk^5kN(a%VeIi|$y|F19do
za4~`PteczXkr@fYDfv&YwON@Oc)$(F>6*{DP@GDbLbxZ;id@d;RT;#N+Dv?<af6dn
zt+q)+%I*wCdH{Jpk*xNWl58$ygpi?^oF9Km(zKP$z)TH}hwwk-3eVwoWmKOe<omD4
zexo2*B8w-J>B08;{{V$lkIR}8l4mEO@0yPGe9s^(c0bF{BO}t8r^76_PSPm_I486J
z0QKp`B^7bgwJKL-`xrcl+U%9xw5qn=fQ%o;oA#x+x;DV_Mh86)Z}aa?yE6TnQ3oul
zTmJyBj`*%h-f5wqd1fpdC!Vy{x|vgpto@_;9lSUMDH!cnQI}zcU-fKq5BCqR_|`Oc
z%QSFY=ZQY>{_BpvpI^qcB5>fTBC~D9gzwaBqH9Op8isUTu7{+t^~d-g+<H{8-53@*
zA(IiD`A);`=c=&u>__8Lq22^-jm!5K{{SQHQ29!Q;&6P4dV~BY*YN&TtSZ=0pK_k3
zX0jt(+eT4A+cZ7lZ|bM9tk~J3BX?ehHNS4MthfXVsD|I0kt%<5`q6WyLo{A|3AGPY
z?-}$y-iE46n^bkeX=ut2NT(U<D%=vuYcHD#>(q85(2Ati?yKvHK_jjJBryGIRIF&@
zb5?t3uHk8|hSCOJEcEvt)rtP5Z=0du`-;)DxoyFL*z3^L*S0Br8%YX<m~zpMhuwd|
zsq3oTOHCPxG&@|W<aGzuqTqsu<{0Q|(uVT3xcQ?v&$UvxQmwgjpK6DurL96)t@R9B
zUO$&O7>?<`MSoHL*HP{(BI4R>8@9K&LXJNp=t1s1>h08PaTee(%Si>uo)xpc4Dhe|
zxfEsLKHjV9zm+*PC9dVh@!Mf7&Hz1YR^5rgQb6xn%V#yL!5oHG8<EuITzZ3AF~q<J
zPNS`1%G0rKNt%}PI<O?>wG!_H2OL%`GRG?HakSOo14IC9k?TzvHFDL(ycZ*K^)-oi
zq{q0E!1k?JU87b<II9~LNb(NvtsY|7=Ha&cIsPIl%S*SAO<tPc4b$+dmlpX`fl~En
zB(;!5bGwHJ>MGnaIBcN>dxKa8;jN^@ZYx%j{{U_Z<chl;X=rbYsKNP$57MngaE(|o
zYH5%h40_d;R@fzt_bKO&)Fgz{HOK+CBMy61b3qN<O$?|odV~Ecj1z(eB*S_dnVhf8
za&u8CchwxVaLRhE8Je<O=$8Q`C9TQ%U(?vvQ)hn-^dTIKk@BBP%H18*Mk$wa5rdIg
zQbj(CD3UT{v?qi8+DX}*sY(*F(Uc~(W>rjcvaoLSZGgTjwuUhCl0`b|#yMjmyWpLT
zX}u3Tw-O}F7|$ma*F-#vTy!K;ZM1MNH<cJ(YSi!<4gkRDD4zB+t1f1d4V}@9zV3#Z
zHOi~*Q%;u2<4i9#NLor{<ndahYjm_VAiE|$rjev+4%6P28$yUF#0bbZ98i$kBU=S#
z$sE+*U>_&|3Q=~T;PFfLv5|(R;Kg*yw)Tn{c+M)uqX<|F4@z4(gGZbSTZj8Z1wRV7
zD<eod9%-vw%5E4htvg79>f&I;5V@{`eIX>cAdYig2AieD6e9{La$Jed<PlrX916+U
zQ`1(KBjwJgY2!7rn2cj@6<+GjDdRsTDX6&Wh*Cu<EaAIfF)`1jL1+!Mr5iZSGzs+i
zGP!8;(zY$u3(y<@8@*SvjP*llN!=30rIZjj^{1Vy)bm#&wAh#(DO%M}-XD!;$!06q
z&&VgeT(v;W$TemyJ~DV})3>n-1`S$^Tax6N3|5H4rA)R84;?D<LKIW57oLZ;4oFEd
zk+N993OPIz=~U#>;&aN@sW>#oHI$^8N(MjnMY<g98KrADR~&b(7Hk@?GH)0iNI3kd
zb96;G=wvVNrXc{y$t3Uq^)&_U$aMQW8S;V2Jqw>x{RKYb_i>n^IbY&$?;r52sik!U
z5JhFpt%3_t#n(G?ienB=1yar&xnf0HNkf2pQj*ZRnk$irH_8iitFgrMPaU)ix<@zz
z-kTWbC$(u<ui27oo|5@_AMEboN7VA)Ks{)u8?i=riJz%5hFdtFbQbwlBY@0)^L{7v
z0-!(<0Q~B2BQ5l#F?`snz1_>Tm4@dm39TzQOc9&L19&@O2m9UI@cwx8sBJA8dxTD&
zr1k@;tw}=2;GiNj8&Q1+`RDPgit1-B#&W~9H)G2=$3L0=l=O=ovTD;rb}l}(EKpz%
zS}X)97jyYj%5zqhFKSkgXz4|XxmGyIV^<=NVD~h{z#LTR&ot5#V0hxAZgM*gwIg@M
zCNP+xf+{k1rU<f0HFIF^%|{}f;8fUUcfx~I-!Js5(yth*{#njX0MN#Wl|qkt3+Jaw
zC19ORM9z3s7|$fqD#K(@P_f|VtFne|{+t?XyGbYsoq*tz-`<?qN7y<b`KetM%1J2!
zRYz{MTI7UWo^i<f(_QyIK{!1sDM6M7k%<eOl6b(S%nBtIT!!4=GjmWfHuR5?+~c73
zs}9k`iq1<i3=9rAsAjuWk~oTjoF2dDnwHHJM7RRk1dd5O<yWm%fQg}4W9|bEdHNsL
ztZFl!ahAr@i~>MEtz=kkwVPqaaN#}2T36^-EzzsLlNkg7(=^<L?^RlGU1S_&WQ_Ew
z6q4JRZBjU>XB9+FuJNIG_McU5L8P~kT!|u(scy<h`G>cyIxGjk9edSTVNo+HcjOV;
zr8@*|EzJA9Ijv+4OEMGkvZ!6Y#PzGT(g@{3!v*;O_Bh2$9npt$7emc~_|JS*StCJr
zZjeYGC2~O=@kG7r7g92mQkPDp1QM&3R7Tw2G8~m``qt_!cJOiXs2%ymQ?ZGbM`s`e
z9KULUd!uu3fk{}<{F&~3O;gu%N|gPpQClK8<(*{nK|kv<xZtJ*K_!z+rs6Ode;R>|
zA75(LSlp-%fzeOQzr?>o=~nGvNafud@Ou%~pEpCHMMY?1Y4AUnECdp+yOiUd%l&bh
z)3h!|(s>o4Z6jA(h(H7!vW}F=tkoZSa6PJJ8=_>Ci6WDJ6~6al>;C}Pt7Sy7^Y?{8
z6h2_d7(Ma)>dFQO=KJ5yx!P%*bEf056QN~SQ}f6Czxvf;8H%4d06nBb(EkASjSPSs
zj`YB$H^>|wzV%V5VqsUK8+l>~`$euyXR7y8=xZ_J>_!i*ZbIfKByy}UN2sP<S`H+P
z21C==>GiF-l$FlPbr!}M1qj>*M|!mj94jjj!;nX6ds~Cor@c4IXt`6Bq?ys93GKz%
zXN(1$j5-dEJxBOf45Y4Ah8P@Tw`I#HWjkDQ2<i=0x{f_Sj0H>0xyNZfp7o>mmzgWw
zentbGx!cgw5+ECJs|<x%mBu#l%_1(==kTmq&|S17qeS4nxj8C6@*nRnq5Tb8NZQec
z^S&1Qk^#dH=zXfRD&1Fx6>=N0xsFuxA%DKU{-fTSchQ-@Ab65h-3bK#wImX{0J3xa
zDjSO<3oF9^04s8iIv@VOwIo*=ZL)Gf`@)k=J%+5&qK>3s{Mf1O{!&7o4{E5YjKS29
zd;8V6Q;<Qd+a^sUOv)JNy+&<|j+v<obI8G|;Q6sp7kL~h-bPf~YBqeu{VOhAHSL>x
zV|0F%+e>V+AS82Cp2?FW46mWCo>QtbIL27rikz!et4Ur1jmK%=irKu<Bb09g2imim
z*b-tVYmS+zX`@vn(8<xk-x<j3iXgp}8|KL#^&~<_PT3bD)}JH^ox`E2a$~YAtby>#
z$DX35hIL%SCLIq2TB%1H54qG-&9*~^;r%FUnHf%;WUV6Yyd_(uSZKlRPK+O#pq^r!
z;=L$Z);#p|xZ@Ila5~cksK9Ns*zPx0qDEwkfyOFh9%(jZ8D3X1<hKr|HBoJUdC20n
zp4GMz1yG#%V<gs!HfGJ-U<j*%D<<u>u?t!|gr|d1NoH7s*A-HB98AQuc{w#Q*_G+m
zw7jsZjC86q+?b!I6)tYVvblEI+6YK)Y8bDCb27Q%jYOBK$&y7<xI*pbq4|0d=~ATb
zS}k1PM;!Uct(_+ICWrt=YlgkIo#pvKARe9S{*Ns7?g>^HJm$5h?-|ZVc>D~~q1>F-
zY<J>70X=H{w1M~oHBB@^Xs;!*$oo%2PhCD+XA(eir>LzjDM-!^eJNsWjiDS-9R{7A
zg_~2pKbwP6n2A4gBkNMyS~)v_I6aMOrK%1{=CpE8P;TaY8hSU)$Tgsmgct^%Kwbqj
zw>*mHoXSxH$0z>)*HJM!=A6x*wHux@ST<;o(%9=!vCn!!bKaQEg(uRY<=A0yAS51@
zUiHi{Iv>8sNzY;F{xp^kv`H!K#3((_{{UK{zJ2?H9F|f(yiv_%a$CAFyy7V)MV2~;
zc3aw~+45&Kj2jd+W*x*NHbLN1tcw?I!+}rKW{Eyo0blloL2L4iAue5j7_F1KG^rnT
znz69KW*Jq%BiGurUP88!D;&u=GdcYi^dGHdGXRAYj4x6<nzJ;!?g&%zqAHKT0YoKh
zqmqoTD-*;6u>-X{!}F8*Q#A9OlULzSI(1Rhwgpkt^d^okv)X4EdARy0{{R#GszgkI
zPo0CvcKy-vuh4X*jC_Rmtvrmp#Av}Ij+J6G80k^=%}*apQ(Kb9B6r6%9Fga(SqF-2
zfB~AT5>fJqH6Y+%Q;$6;E4GtBG>{HE(iWhpCyu6^Av_aOP~7Jjrn3Yd=B8x|ed;Zv
zIX!3@6E1l@Jt_%CIVYO6BZXS3=OnQoN{L9)F)2Ji9G*!%O;?jF%onFX2YR+on<hH@
z)pt=8g+Un|NUW}Ckr+W5<N=aDDu9pO=dMX4`qZ})0fb+=PtA`?h{!hWKg2RUu}6^X
zRGGwZ_AGsSdQuk>Mdi%cW&mv+MMrRakn%Twx=%s=H3X~!Jj5IxgQ4kCn=Z(!HPZd1
z)F{U8JMmdMg~OOh1mw8{dk$$WWRiQCi;O8?bM2fFO4jD&ODlcn$-wP`PxPR*WJNBj
z(g<S!Gco`P>JO%B>JRkTvJ05g1ZUGce?P*Zg3TehfT#@TEx)p2q*-H@NFcW5Oqr08
zjo;om^aB_k{<PPaJ9HR>Y=k2T9lmY&;~!7!Rug^zV1FL<OUVN9Pt3R^DIVMk(uizO
zcV{El^{H=DmaHs2axskLQqLcnpo7S#s^ojqwl;7&R&AV-c9BHk)W;w`$KI;>b6H$L
zDkveJYbo!?Ut?QP**wk)BL?^Ae!i6ss|+^LG>sva;a76`W8^?R`}M1YTh#ATQjaN_
z46|PsK@-NYovgm6*NVQD0H7^~$6;A|JaRM=GReo8jk)0Q*0n-OY(VHwJanp+u3U6l
z4vmaXayjU7dWx{E9}F@v)Ou4E7L;xs`x<h<$&s~{f#ci%0IgZ<TC*fco6J(AFwari
zr*|L+P%=0wdy0lM+S|TR-S&>vU@E#cK*(TeUEan{Nz}JtvF;dX$@+R#<#6aSpdV_k
ziGnuZA4--|(R|p+^rF`lW}Tw9Wa}7lfPS9FqhLn_Q5(lEsyC-<nTF)w+;G^e<pY|f
z8xR4-O%#E04=Q?!go20EQr0SNEXhg5*^ZXqoQB9f53NDvZ9dgzH{0@`x@pY0>-Z7g
zx+65sx^=o4w-)>oo%>c5^lW0>G94R&E2?xXK3d1WfCYb-2N}g~wmKs#nMo>YspZZ@
zcVJ*hcn|D-{p&JwJ*=%1szFnX8j|4|PU!meAW$!o@ddr(?P0`o5z6~FsHt|A`<pAT
zLgkIv0B+|se8dcUl;j?31jj9`-du`7?0XUFD^d%>suYZhh1rCcRA}3=SjsT-A#c8!
z$KpK=A%pBrtbj=i@~w`(-pAUi8B}1ddGAlOx-qjO<>6!VAwQ_}H2JNeE;lt`xwm`<
z8%<pTJ-;ce&$F&om9mY>Kd<;7YP5(^D;ZsGa#f=ktHrbuvu-@oV!4qR5CIvcIp(Lf
zi+9qZ7jHv~Dx-IcEvB?o;i5S9s}bs!(u@`=eZ^Li%rbnq$f|N$73eExIm2?@T9Mq@
z#7@~qQ}i`T_e~cADn4=0ip~*Ar{&1{RG(?NhxpIqN3~CMX&hH{Rhk*%;ei<ML^6;z
zN}!I|t7A}<j5@3RMMe@iB-~J`_NjA@>76mf(|eM_vB}D|-)gY-ir^3iM{!L^&>XI6
z!i)-!Xk%9uAF|6aTd)PNNc5_AknUlQ^if{SqLeOkrul4C7VORydz@EKYXYPoA4({w
zq40_c6PQCG=~#D`8{GDyirPx$r>RaG285DG+f%9u4il{uQbLn=8uo6@hmC;s6n|g~
zfC8{=^v@JgSIC+b+GEqf<nUuS{{UsGczhWoa)LQS)Hm{?ipi@x7k14v;M)nJRhBM5
zz&Yf8G}!zn4Y|US%kF5RlYFm1B$;7+HEaAxGCC1f;L)wovdRuR9+Xi_o>wcC7uLqy
ze+a1syi3tV6jO3xWIb@50HiWviYgqLN+@Y;ifK6*2Z|`EWmt`cLm}=@6j4yiMQI-&
zcA+vBQlsx<y%bh`N_#Sml)I6@KPY30t$%P)#|Mr@6jV0(OriWUI{-f&4<qSLGZaz@
z!J>*$9Ta8XymBe?7c~Z+!rm60TO^YlOYi(g@Em<lwG>v=x|`MbOY!7@xW^S@5tH?z
zik0ymWi2X~p!qZOtqT|Vq?7xL!S_6WQ$-cB-0P7ja4<45dJ3?I<i!+KAu+(@jC2(`
z9GWPmiDUP6rb!_u0*WfG#bFRrh&lG6icRP=kl+@MLH0CJQ+7Kcj2ND^8A5oXicOVb
zyoh`H)Jng+!Stev&F)$iZLyLE)YVy`4hX=_6jJCkQn|M(0o%qg!0S~OHd%&G1CUQ@
zD5+(klgEHS>6~Emilp|<f=d$Q^v^-<MHH^?RkSM`ADp>gmPI)Su;U-rszs-}x)mIz
zcIW%ub43(fYhr7CA>o4I)xLRxf)5~M<+%R<J!<8?lriUJLlKS#zcf)oVr-H0q*MjA
z<%Ti`e&(7iqf*SkWSryDiYSqVO^DqT9zRN!9itntXri-woD%qpg~q}F;QCcdn>gf!
z%!P|O?c3A}D6Jswc2hZx4>tEwMI&P*sXq15C`9=VHucCkZ~nCuR*!Q;D%(QNUKt5-
z*qXYNsX())03Cg(qLq<Isl)#DN66<Z^rA7AWyo>2rg;=mSlixa6t!|>k?m8tl#F*K
znA(lIzS$MWZNrg86uGRfZ9#L*x6ofR=E{*G_Bg>6rD*Eo89a`2MHO8Uw4$BH&mPvn
z%ChmE#CE1HBOa7dSk#KS$v7(x#wqCBVDzGj=T6T<o#^x#AS!YX<5ezp%7s`A;{ee`
zbyJhM(N0dsDRUD>=dsU9t1L+<Ag&Ls6jey{;O`rY9FnXaLH*({DW0qJG$iNlvXDg-
zQoCbm7ChEL>J3<qMJI6`Xrh#ajVnofx7tZ~U+*ucxc9ALl^aeONcmWI6j4zp?wq)t
zitaGgdc^QJ1R5x%OLLNeq|qy;Q@2{JBu|1BvONtHS6#F^8c%Yf+(`_ZWsP?FikoyS
zH~C^fdr?JZCqka~A~C>8A~L?&rwCN?2kS)@%Tb(_p%t+z#s_MC0HTVBE3=KC|Jir}
Bp^E?j

literal 0
HcmV?d00001

diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/v2/tests/test_image.py
new file mode 100644
index 0000000000..c78bbdc40a
--- /dev/null
+++ b/python/paddle/v2/tests/test_image.py
@@ -0,0 +1,43 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.v2.image as image
+
+
+class Image(unittest.TestCase):
+    def test_resize_flip_chw(self):
+        # resize
+        im = image.load_image('cat.jpg')
+        im = image.resize_short(im, 256)
+        self.assertEqual(256, min(im.shape[:2]))
+        self.assertEqual(3, im.shape[2])
+
+        # flip
+        im = image.left_right_flip(im)
+        im2 = np.flip(im, 1)
+        self.assertEqual(im.all(), im2.all())
+
+        # to_chw
+        h, w, c = im.shape
+        im = image.to_chw(im)
+        self.assertEqual(c, im.shape[0])
+        self.assertEqual(h, im.shape[1])
+        self.assertEqual(w, im.shape[2])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py
index 8320217da2..264442be18 100644
--- a/python/paddle/v2/tests/test_paramconf_order.py
+++ b/python/paddle/v2/tests/test_paramconf_order.py
@@ -27,7 +27,6 @@
 # limitations under the License.
 import unittest
 import math
-import paddle.dataset as dataset
 import paddle.v2 as paddle
 
 
@@ -41,7 +40,7 @@ def wordemb(inlayer):
 
 
 def train():
-    word_dict = dataset.imikolov.build_dict()
+    word_dict = paddle.dataset.imikolov.build_dict()
     dict_size = len(word_dict)
     # Every layer takes integer value of range [0, dict_size)
     firstword = paddle.layer.data(
diff --git a/python/setup.py.in b/python/setup.py.in
index d73a3a6a1c..08a448934d 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -77,6 +77,8 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
                'paddle.v2',
                'paddle.v2.master',
                'paddle.v2.plot',
+               'paddle.v2.reader',
+               'paddle.v2.dataset',
                'py_paddle']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:

From f801f743d165784f0beb396d84ac4936c1c0102a Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Mon, 2 Apr 2018 13:22:37 -0700
Subject: [PATCH 45/57] fix the __all__ in dataset package

---
 python/paddle/v2/dataset/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
index c1acbecd9c..38056fe0a9 100644
--- a/python/paddle/v2/dataset/__init__.py
+++ b/python/paddle/v2/dataset/__init__.py
@@ -36,7 +36,7 @@ __all__ = [
     'cifar',
     'movielens',
     'conll05',
-    'sentiment'
+    'sentiment',
     'uci_housing',
     'wmt14',
     'wmt16',

From 974908f766748aab52e06d10ea592898be029d82 Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Mon, 2 Apr 2018 16:46:10 -0700
Subject: [PATCH 46/57] TeamCity build: handle case when WITH_FLUID_ONLY=OFF

---
 paddle/scripts/docker/build.sh | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index f916295cd7..4885b74e6c 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -104,7 +104,9 @@ EOF
         # make install should also be test when unittest
         make install -j `nproc`
         pip install /usr/local/opt/paddle/share/wheels/*.whl
-        paddle version
+        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
+            paddle version
+        fi
     fi
 }
 
@@ -183,6 +185,14 @@ EOF
         NCCL_DEPS=""
     fi
 
+    if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
+        PADDLE_VERSION="paddle version"
+        CMD='"paddle", "version"'
+    else
+        PADDLE_VERSION="true"
+        CMD='"true"'
+    fi
+
     cat >> /paddle/build/Dockerfile <<EOF
     ADD python/dist/*.whl /
     # run paddle version to install python packages first
@@ -192,7 +202,7 @@ EOF
         pip install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
         rm -f /*.whl && \
-        paddle version && \
+        ${PADDLE_VERSION} && \
         ldconfig
     ${DOCKERFILE_CUDNN_DSO}
     ${DOCKERFILE_GPU_ENV}
@@ -200,7 +210,7 @@ EOF
     ADD go/cmd/pserver/pserver /usr/bin/
     ADD go/cmd/master/master /usr/bin/
     # default command shows the paddle version and exit
-    CMD ["paddle", "version"]
+    CMD [${CMD}]
 EOF
 }
 

From faa752a471e6a84c48c95d197e7baab1b9d9fc67 Mon Sep 17 00:00:00 2001
From: jasontangjs <jason.tangjs@gmail.com>
Date: Tue, 3 Apr 2018 08:54:36 +0800
Subject: [PATCH 47/57] update index_en.rst (#9400)

* update index_en.rst

* fix comments

* fix uppercase
---
 doc/v2/faq/build_and_install/index_en.rst | 146 +++++++++++++++++++++-
 1 file changed, 142 insertions(+), 4 deletions(-)

diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst
index 614db457d7..7488ed8137 100644
--- a/doc/v2/faq/build_and_install/index_en.rst
+++ b/doc/v2/faq/build_and_install/index_en.rst
@@ -1,5 +1,143 @@
-############################
-Install, Build and Unit test
-############################
+.. _install_faq:
 
-TBD
+###############################
+Compile, Install, and Unit Test
+###############################
+
+..  contents::
+
+1. Insufficient CUDA driver version
+----------------------------------------------------------------
+
+Many users usually face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is that you may not map the local CUDA driver to a container directory.
+You can solve the issue by running the following commands:
+
+..  code-block:: bash
+
+    $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+For more infomation about Docker's installation and usage, please refer to `PaddlePaddle Docker documentation <http://www.paddlepaddle.org/docs/0.11.0/documentation/zh/getstarted/build_and_install/docker_install_en.html>`_ .
+
+
+2. Version mismatch between PythonLibs and PythonInterpreter
+----------------------------------------------------------------
+
+It is a common bug when CMake looks up Python. If you install multiple versions of Python, Cmake may find the version mismatch between PythonLibs and PythonInterpreter . You are forced to specify a Python version, as follows.
+
+    ..  code-block:: bash
+
+        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>
+
+You should specify ``<exc_path>``, ``<lib_path>``, ``<inc_path>`` to your local paths.
+
+3. PaddlePaddle version is 0.0.0
+------------------------------------------------
+This issue would happen when you run the code  `paddle version` or `cmake ..`
+
+..  code-block:: bash
+
+    CMake Warning at cmake/version.cmake:20 (message):
+      Cannot add paddle version from git tag
+
+You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then run :code:`cmake`
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+The primary cause for this issue is that it can not find the correct PaddlePaddle installation package that matches your current system.The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12 os including Python2.7 and Pip 9.0.1.
+
+You can upgrade Pip with the following command\:
+
+..  code-block:: bash
+
+    pip install --upgrade pip
+
+If it does not work for you, you can run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the suffix of Python package which your system may support and then compare it with the suffix of your installation.
+
+If the system supports :code:`linux_x86_64` and  the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest 
+
+if the system supports :code:`manylinux_x86_64` and the local installation package is :code:`linux1_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again.
+
+
+5. ImportError: No module named v2
+----------------------------------
+Please uninstall Paddle V1 if you have installed it before.
+
+..  code-block:: bash
+
+    pip uninstall py_paddle paddle
+
+Then install Python for PaddlePaddle , enter the build directory and run the following commands
+
+pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
+
+6. Illegal instruction
+-----------------------
+This issue may be caused by the wrong usage of PaddlePaddle binary version which uses avx SIMD instructions to increase the performance of cpu. Please choose the correct version.
+
+7.  Python unittest fails
+--------------------------------
+
+If the following python unittest testcases fail:
+
+..  code-block:: bash
+
+    24 - test_PyDataProvider (Failed)
+    26 - test_RecurrentGradientMachine (Failed)
+    27 - test_NetworkCompare (Failed)
+    28 - test_PyDataProvider2 (Failed)
+    32 - test_Prediction (Failed)
+    33 - test_Compare (Failed)
+    34 - test_Trainer (Failed)
+    35 - test_TrainerOnePass (Failed)
+    36 - test_CompareTwoNets (Failed)
+    37 - test_CompareTwoOpts (Failed)
+    38 - test_CompareSparse (Failed)
+    39 - test_recurrent_machine_generation (Failed)
+    40 - test_PyDataProviderWrapper (Failed)
+    41 - test_config_parser (Failed)
+    42 - test_swig_api (Failed)
+    43 - layers_test (Failed)
+
+Please check the PaddlePaddle unittest logs which may suggest the following:
+
+..  code-block:: bash
+
+    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+The solution is:
+
+* Remove old PaddlePaddle to make a clean environment for the unit tests. If PaddlePaddle package is already in Python's site-packages, unit tests would refer Python package in site-packages instead of Python package in the :code:`/python` directory of the source directory.  Setting :code:`PYTHONPATH` to :code:`/python` is also useless because Python's search path would give the priority to the installed Python package.
+
+
+8. Failed to download the MKLML library
+----------------------------------------------
+
+..  code-block:: bash
+
+    make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4
+    make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2
+    make[1]: *** waiting for the unfinished  jobs....
+
+Cause: The network speed or SSL link causes the MKLML library to download unsuccessfully.
+
+The solution is: manually download and install, the specific steps are as follows.
+
+..  code-block:: bash
+
+    // 1. enter the directory
+    cd build/third_party/mklml/src/extern_mklml
+
+    // 2. check the size of the package, normally 75M, if less than 75M, the download fails
+    du -sh mklml_lnx_2018.0.1.20171007.tgz
+
+    // 3. manually download and unzip and make the download success tag:
+    wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz 
+    tar zxf mklml_lnx_2018.0.1.20171007.tgz
+    touch ../extern_mklml-stamp/extern_mklml-download
+
+    // 4. then compile
+    

From 172c887d1c838aa3df9e00e355d9b9c12d930f6b Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Tue, 3 Apr 2018 10:38:50 +0800
Subject: [PATCH 48/57] init (#9462)

---
 benchmark/fluid/machine_translation.py  | 349 ++++++++++++++++++++++++
 benchmark/fluid/mnist.py                | 205 ++++++++++++++
 benchmark/fluid/resnet.py               | 323 ++++++++++++++++++++++
 benchmark/fluid/run.sh                  |  49 ++++
 benchmark/fluid/stacked_dynamic_lstm.py | 209 ++++++++++++++
 benchmark/fluid/vgg.py                  | 220 +++++++++++++++
 6 files changed, 1355 insertions(+)
 create mode 100644 benchmark/fluid/machine_translation.py
 create mode 100644 benchmark/fluid/mnist.py
 create mode 100644 benchmark/fluid/resnet.py
 create mode 100644 benchmark/fluid/run.sh
 create mode 100644 benchmark/fluid/stacked_dynamic_lstm.py
 create mode 100644 benchmark/fluid/vgg.py

diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py
new file mode 100644
index 0000000000..cc31d09832
--- /dev/null
+++ b/benchmark/fluid/machine_translation.py
@@ -0,0 +1,349 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""seq2seq model for fluid."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import distutils.util
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+from paddle.fluid.executor import Executor
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--embedding_dim",
+    type=int,
+    default=512,
+    help="The dimension of embedding table. (default: %(default)d)")
+parser.add_argument(
+    "--encoder_size",
+    type=int,
+    default=512,
+    help="The size of encoder bi-rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--decoder_size",
+    type=int,
+    default=512,
+    help="The size of decoder rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--batch_size",
+    type=int,
+    default=16,
+    help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+    "--dict_size",
+    type=int,
+    default=30000,
+    help="The dictionary capacity. Dictionaries of source sequence and "
+    "target dictionary have same capacity. (default: %(default)d)")
+parser.add_argument(
+    "--pass_num",
+    type=int,
+    default=2,
+    help="The pass number to train. (default: %(default)d)")
+parser.add_argument(
+    "--learning_rate",
+    type=float,
+    default=0.0002,
+    help="Learning rate used to train the model. (default: %(default)f)")
+parser.add_argument(
+    "--infer_only", action='store_true', help="If set, run forward only.")
+parser.add_argument(
+    "--beam_size",
+    type=int,
+    default=3,
+    help="The width for beam searching. (default: %(default)d)")
+parser.add_argument(
+    "--use_gpu",
+    type=distutils.util.strtobool,
+    default=True,
+    help="Whether to use gpu. (default: %(default)d)")
+parser.add_argument(
+    "--max_length",
+    type=int,
+    default=250,
+    help="The maximum length of sequence when doing generation. "
+    "(default: %(default)d)")
+
+
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+    def linear(inputs):
+        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
+    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+    cell_t = fluid.layers.sums(input=[
+        fluid.layers.elementwise_mul(
+            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+                x=input_gate, y=cell_tilde)
+    ])
+
+    hidden_t = fluid.layers.elementwise_mul(
+        x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+    return hidden_t, cell_t
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+                   target_dict_dim, is_generating, beam_size, max_length):
+    """Construct a seq2seq network."""
+
+    def bi_lstm_encoder(input_seq, gate_size):
+        # Linear transformation part for input gate, output gate, forget gate
+        # and cell activation vectors need be done outside of dynamic_lstm.
+        # So the output size is 4 times of gate_size.
+        input_forward_proj = fluid.layers.fc(input=input_seq,
+                                             size=gate_size * 4,
+                                             act=None,
+                                             bias_attr=False)
+        forward, _ = fluid.layers.dynamic_lstm(
+            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
+        input_reversed_proj = fluid.layers.fc(input=input_seq,
+                                              size=gate_size * 4,
+                                              act=None,
+                                              bias_attr=False)
+        reversed, _ = fluid.layers.dynamic_lstm(
+            input=input_reversed_proj,
+            size=gate_size * 4,
+            is_reverse=True,
+            use_peepholes=False)
+        return forward, reversed
+
+    src_word_idx = fluid.layers.data(
+        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+    src_embedding = fluid.layers.embedding(
+        input=src_word_idx,
+        size=[source_dict_dim, embedding_dim],
+        dtype='float32')
+
+    src_forward, src_reversed = bi_lstm_encoder(
+        input_seq=src_embedding, gate_size=encoder_size)
+
+    encoded_vector = fluid.layers.concat(
+        input=[src_forward, src_reversed], axis=1)
+
+    encoded_proj = fluid.layers.fc(input=encoded_vector,
+                                   size=decoder_size,
+                                   bias_attr=False)
+
+    backward_first = fluid.layers.sequence_pool(
+        input=src_reversed, pool_type='first')
+
+    decoder_boot = fluid.layers.fc(input=backward_first,
+                                   size=decoder_size,
+                                   bias_attr=False,
+                                   act='tanh')
+
+    def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
+                                    decoder_boot, decoder_size):
+        def simple_attention(encoder_vec, encoder_proj, decoder_state):
+            decoder_state_proj = fluid.layers.fc(input=decoder_state,
+                                                 size=decoder_size,
+                                                 bias_attr=False)
+            decoder_state_expand = fluid.layers.sequence_expand(
+                x=decoder_state_proj, y=encoder_proj)
+            concated = fluid.layers.concat(
+                input=[encoder_proj, decoder_state_expand], axis=1)
+            attention_weights = fluid.layers.fc(input=concated,
+                                                size=1,
+                                                act='tanh',
+                                                bias_attr=False)
+            attention_weights = fluid.layers.sequence_softmax(
+                input=attention_weights)
+            weigths_reshape = fluid.layers.reshape(
+                x=attention_weights, shape=[-1])
+            scaled = fluid.layers.elementwise_mul(
+                x=encoder_vec, y=weigths_reshape, axis=0)
+            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
+            return context
+
+        rnn = fluid.layers.DynamicRNN()
+
+        cell_init = fluid.layers.fill_constant_batch_size_like(
+            input=decoder_boot,
+            value=0.0,
+            shape=[-1, decoder_size],
+            dtype='float32')
+        cell_init.stop_gradient = False
+
+        with rnn.block():
+            current_word = rnn.step_input(target_embedding)
+            encoder_vec = rnn.static_input(encoder_vec)
+            encoder_proj = rnn.static_input(encoder_proj)
+            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+            cell_mem = rnn.memory(init=cell_init)
+            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
+            decoder_inputs = fluid.layers.concat(
+                input=[context, current_word], axis=1)
+            h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+            rnn.update_memory(hidden_mem, h)
+            rnn.update_memory(cell_mem, c)
+            out = fluid.layers.fc(input=h,
+                                  size=target_dict_dim,
+                                  bias_attr=True,
+                                  act='softmax')
+            rnn.output(out)
+        return rnn()
+
+    if not is_generating:
+        trg_word_idx = fluid.layers.data(
+            name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+        trg_embedding = fluid.layers.embedding(
+            input=trg_word_idx,
+            size=[target_dict_dim, embedding_dim],
+            dtype='float32')
+
+        prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
+                                                 encoded_proj, decoder_boot,
+                                                 decoder_size)
+        label = fluid.layers.data(
+            name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
+
+        return avg_cost, feeding_list
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    lod_t = core.LoDTensor()
+    lod_t.set(flattened_data, place)
+    lod_t.set_lod([lod])
+    return lod_t, lod[-1]
+
+
+def lodtensor_to_ndarray(lod_tensor):
+    dims = lod_tensor.get_dims()
+    ndarray = np.zeros(shape=dims).astype('float32')
+    for i in xrange(np.product(dims)):
+        ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+    return ndarray
+
+
+def train():
+    avg_cost, feeding_list = seq_to_seq_net(
+        args.embedding_dim,
+        args.encoder_size,
+        args.decoder_size,
+        args.dict_size,
+        args.dict_size,
+        False,
+        beam_size=args.beam_size,
+        max_length=args.max_length)
+
+    # clone from default main program
+    inference_program = fluid.default_main_program().clone()
+
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    train_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    test_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    def do_validation():
+        total_loss = 0.0
+        count = 0
+        for batch_id, data in enumerate(test_batch_generator()):
+            src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0]
+            trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0]
+            lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0]
+
+            fetch_outs = exe.run(inference_program,
+                                 feed={
+                                     feeding_list[0]: src_seq,
+                                     feeding_list[1]: trg_seq,
+                                     feeding_list[2]: lbl_seq
+                                 },
+                                 fetch_list=[avg_cost],
+                                 return_numpy=False)
+
+            total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
+            count += 1
+
+        return total_loss / count
+
+    for pass_id in xrange(args.pass_num):
+        pass_start_time = time.time()
+        words_seen = 0
+        for batch_id, data in enumerate(train_batch_generator()):
+            src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
+            words_seen += word_num
+            trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
+            words_seen += word_num
+            lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
+
+            fetch_outs = exe.run(framework.default_main_program(),
+                                 feed={
+                                     feeding_list[0]: src_seq,
+                                     feeding_list[1]: trg_seq,
+                                     feeding_list[2]: lbl_seq
+                                 },
+                                 fetch_list=[avg_cost])
+
+            avg_cost_val = np.array(fetch_outs[0])
+            print('pass_id=%d, batch_id=%d, train_loss: %f' %
+                  (pass_id, batch_id, avg_cost_val))
+
+        pass_end_time = time.time()
+        test_loss = do_validation()
+        time_consumed = pass_end_time - pass_start_time
+        words_per_sec = words_seen / time_consumed
+        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
+              (pass_id, test_loss, words_per_sec, time_consumed))
+
+
+def infer():
+    pass
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    if args.infer_only:
+        infer()
+    else:
+        train()
diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
new file mode 100644
index 0000000000..7f7afaeb11
--- /dev/null
+++ b/benchmark/fluid/mnist.py
@@ -0,0 +1,205 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 1
+DTYPE = "float32"
+
+# random seed must set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("mnist model benchmark.")
+    parser.add_argument(
+        '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--iterations', type=int, default=35, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=5, help='The number of passes.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed settting
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=conv_pool_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=scale)))
+    return predict
+
+
+def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+    test_pass_acc = fluid.average.WeightedAverage()
+    for batch_id, data in enumerate(test_reader()):
+        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
+                                data)).astype(DTYPE)
+        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+        y_data = y_data.reshape([len(y_data), 1])
+
+        acc, weight = exe.run(inference_program,
+                              feed={"pixel": img_data,
+                                    "label": y_data},
+                              fetch_list=[batch_acc, batch_size_tensor])
+        test_pass_acc.add(value=acc, weight=weight)
+        pass_acc = test_pass_acc.eval()
+    return pass_acc
+
+
+def run_benchmark(model, args):
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()
+    start_time = time.time()
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    predict = model(images)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    # Optimization
+    opt = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.001, beta1=0.9, beta2=0.999)
+    opt.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    # Initialize executor
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    # Parameter initialization
+    exe.run(fluid.default_startup_program())
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+
+    accuracy = fluid.average.WeightedAverage()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        pass_start = time.time()
+        for batch_id, data in enumerate(train_reader()):
+            img_data = np.array(
+                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([len(y_data), 1])
+
+            start = time.time()
+            outs = exe.run(
+                fluid.default_main_program(),
+                feed={"pixel": img_data,
+                      "label": y_data},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
+            )  # The accuracy is the accumulation of batches, but not the current batch.
+            accuracy.add(value=outs[1], weight=outs[2])
+            end = time.time()
+            loss = np.array(outs[0])
+            acc = np.array(outs[1])
+            print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
+                  (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
+
+        pass_end = time.time()
+
+        train_avg_acc = accuracy.eval()
+        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+                                 inference_program)
+
+        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
+              (pass_id, train_avg_acc, test_avg_acc,
+               (pass_end - pass_start) / 1000))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    if args.use_nvprof and args.device == 'GPU':
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+            run_benchmark(cnn_model, args)
+    else:
+        run_benchmark(cnn_model, args)
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
new file mode 100644
index 0000000000..f0f1db979f
--- /dev/null
+++ b/benchmark/fluid/resnet.py
@@ -0,0 +1,323 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import functools
+import numpy as np
+import time
+
+import cProfile, pstats, StringIO
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('Convolution model benchmark.')
+    parser.add_argument(
+        '--model',
+        type=str,
+        choices=['resnet_imagenet', 'resnet_cifar10'],
+        default='resnet_imagenet',
+        help='The model architecture.')
+    parser.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    parser.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='use real data or fake data')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=100, help='The number of passes.')
+    parser.add_argument(
+        '--data_format',
+        type=str,
+        default='NCHW',
+        choices=['NCHW', 'NHWC'],
+        help='The data data_format, now only support NCHW.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--data_set',
+        type=str,
+        default='flowers',
+        choices=['cifar10', 'flowers'],
+        help='Optional dataset for benchmark.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+    conv1 = fluid.layers.conv2d(
+        input=input,
+        filter_size=filter_size,
+        num_filters=ch_out,
+        stride=stride,
+        padding=padding,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv1, act=act)
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1]
+    if ch_in != ch_out:
+        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+    else:
+        return input
+
+
+def basicblock(input, ch_out, stride):
+    short = shortcut(input, ch_out, stride)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def bottleneck(input, ch_out, stride):
+    short = shortcut(input, ch_out * 4, stride)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
+    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+    res_out = block_func(input, ch_out, stride)
+    for i in range(1, count):
+        res_out = block_func(res_out, ch_out, 1)
+    return res_out
+
+
+def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+
+    cfg = {
+        18: ([2, 2, 2, 1], basicblock),
+        34: ([3, 4, 6, 3], basicblock),
+        50: ([3, 4, 6, 3], bottleneck),
+        101: ([3, 4, 23, 3], bottleneck),
+        152: ([3, 8, 36, 3], bottleneck)
+    }
+    stages, block_func = cfg[depth]
+    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+    pool1 = fluid.layers.pool2d(
+        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+    pool2 = fluid.layers.pool2d(
+        input=res4,
+        pool_size=7,
+        pool_type='avg',
+        pool_stride=1,
+        global_pooling=True)
+    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+    return out
+
+
+def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
+    assert (depth - 2) % 6 == 0
+
+    n = (depth - 2) // 6
+
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
+    return out
+
+
+def run_benchmark(model, args):
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()
+
+    if args.data_set == "cifar10":
+        class_dim = 10
+        if args.data_format == 'NCHW':
+            dshape = [3, 32, 32]
+        else:
+            dshape = [32, 32, 3]
+    else:
+        class_dim = 102
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+
+    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    predict = model(input, class_dim)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+    opts = optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=args.batch_size)
+
+    def test(exe):
+        test_accuracy = fluid.average.WeightedAverage()
+        for batch_id, data in enumerate(test_reader()):
+            img_data = np.array(map(lambda x: x[0].reshape(dshape),
+                                    data)).astype("float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+
+            acc, weight = exe.run(inference_program,
+                                  feed={"data": img_data,
+                                        "label": y_data},
+                                  fetch_list=[batch_acc, batch_size_tensor])
+            test_accuracy.add(value=acc, weight=weight)
+
+        return test_accuracy.eval()
+
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    accuracy = fluid.average.WeightedAverage()
+    if args.use_fake_data:
+        data = train_reader().next()
+        image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
+            'float32')
+        label = np.array(map(lambda x: x[1], data)).astype('int64')
+        label = label.reshape([-1, 1])
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            if not args.use_fake_data:
+                image = np.array(map(lambda x: x[0].reshape(dshape),
+                                     data)).astype('float32')
+                label = np.array(map(lambda x: x[1], data)).astype('int64')
+                label = label.reshape([-1, 1])
+            loss, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={'data': image,
+                      'label': label},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+            iters += 1
+            num_samples += label[0]
+            accuracy.add(value=acc, weight=weight)
+            train_losses.append(loss)
+            train_accs.append(acc)
+            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+                  (pass_id, iters, loss, acc))
+        pass_train_acc = accuracy.eval()
+        # evaluation
+        if args.with_test:
+            pass_test_acc = test(exe)
+        train_elapsed = time.time() - start_time
+        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+
+        examples_per_sec = num_samples / train_elapsed
+
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+
+    if args.use_cprof:
+        pr.disable()
+        s = StringIO.StringIO()
+        sortby = 'cumulative'
+        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
+        ps.print_stats()
+        print(s.getvalue())
+
+
+if __name__ == '__main__':
+    model_map = {
+        'resnet_imagenet': resnet_imagenet,
+        'resnet_cifar10': resnet_cifar10
+    }
+    args = parse_args()
+    print_arguments(args)
+    if args.data_format == 'NHWC':
+        raise ValueError('Only support NCHW data_format now.')
+    if args.use_nvprof and args.device == 'GPU':
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+            run_benchmark(model_map[args.model], args)
+    else:
+        run_benchmark(model_map[args.model], args)
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
new file mode 100644
index 0000000000..663e2efd53
--- /dev/null
+++ b/benchmark/fluid/run.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# This script benchmarking the PaddlePaddle Fluid on
+# single thread single GPU.
+export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
+
+# disable openmp and mkl parallel
+#https://github.com/PaddlePaddle/Paddle/issues/7199
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+if [ $ht -eq 1 ]; then # HT is OFF
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,0,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+        export OMP_DYNAMIC="FALSE"
+    fi
+else # HT is ON
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,1,0"
+    fi
+fi
+# disable multi-gpu if have more than one
+export CUDA_VISIBLE_DEVICES=0
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
+
+
+# vgg16
+# cifar10 gpu cifar10 128
+FLAGS_benchmark=true python fluid/vgg.py \
+               --device=GPU \
+               --batch_size=128 \
+               --skip_batch_num=5 \
+               --iterations=30  \
+               2>&1 > vgg16_gpu_128.log
+
+# resnet50
+# resnet50 gpu cifar10 128
+FLAGS_benchmark=true python fluid/resnet.py \
+               --device=GPU \
+               --batch_size=128 \
+               --data_set=cifar10 \
+               --model=resnet_cifar10 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 > resnet50_gpu_128.log
+
+# lstm
diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py
new file mode 100644
index 0000000000..4e063549e0
--- /dev/null
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@@ -0,0 +1,209 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import cPickle
+import os
+import random
+import time
+
+import numpy
+import paddle.v2 as paddle
+import paddle.v2.dataset.imdb as imdb
+import paddle.fluid as fluid
+from paddle.v2 import batch
+import paddle.fluid.profiler as profiler
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--emb_dim',
+        type=int,
+        default=512,
+        help='Dimension of embedding table. (default: %(default)d)')
+    parser.add_argument(
+        '--hidden_dim',
+        type=int,
+        default=512,
+        help='Hidden size of lstm unit. (default: %(default)d)')
+    parser.add_argument(
+        '--pass_num',
+        type=int,
+        default=100,
+        help='Epoch number to train. (default: %(default)d)')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='CPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--crop_size',
+        type=int,
+        default=int(os.environ.get('CROP_SIZE', '1500')),
+        help='The max sentence length of input. Since this model use plain RNN,'
+        ' Gradient could be explored if sentence is too long')
+    args = parser.parse_args()
+    return args
+
+
+word_dict = imdb.word_dict()
+
+
+def crop_sentence(reader, crop_size):
+    unk_value = word_dict['<unk>']
+
+    def __impl__():
+        for item in reader():
+            if len([x for x in item[0] if x != unk_value]) < crop_size:
+                yield item
+
+    return __impl__
+
+
+def main():
+    args = parse_args()
+    lstm_size = args.hidden_dim
+
+    data = fluid.layers.data(
+        name="words", shape=[1], lod_level=1, dtype='int64')
+    sentence = fluid.layers.embedding(
+        input=data, size=[len(word_dict), args.emb_dim])
+
+    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        word = rnn.step_input(sentence)
+        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+        def gate_common(
+                ipt,
+                hidden,
+                size, ):
+            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+            gate = fluid.layers.sums(input=[gate0, gate1])
+            return gate
+
+        forget_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        input_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        output_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        cell_gate = fluid.layers.tanh(
+            x=gate_common(word, prev_hidden, lstm_size))
+
+        cell = fluid.layers.sums(input=[
+            fluid.layers.elementwise_mul(
+                x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
+                    x=input_gate, y=cell_gate)
+        ])
+
+        hidden = fluid.layers.elementwise_mul(
+            x=output_gate, y=fluid.layers.tanh(x=cell))
+
+        rnn.update_memory(prev_cell, cell)
+        rnn.update_memory(prev_hidden, hidden)
+        rnn.output(hidden)
+
+    last = fluid.layers.sequence_pool(rnn(), 'last')
+    logit = fluid.layers.fc(input=last, size=2, act='softmax')
+    loss = fluid.layers.cross_entropy(
+        input=logit,
+        label=fluid.layers.data(
+            name='label', shape=[1], dtype='int64'))
+    loss = fluid.layers.mean(x=loss)
+
+    # add acc
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+                shape=[1], dtype='int64'), total=batch_size_tensor)
+
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    adam = fluid.optimizer.Adam()
+    adam.minimize(loss)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    def train_loop(pass_num, crop_size):
+        with profiler.profiler(args.device, 'total') as prof:
+            for pass_id in range(pass_num):
+                train_reader = batch(
+                    paddle.reader.shuffle(
+                        crop_sentence(imdb.train(word_dict), crop_size),
+                        buf_size=25000),
+                    batch_size=args.batch_size)
+                word_nums = 0
+                pass_start_time = time.time()
+                for batch_id, data in enumerate(train_reader()):
+                    tensor_words = to_lodtensor([x[0] for x in data], place)
+                    for x in data:
+                        word_nums += len(x[0])
+                    label = numpy.array([x[1] for x in data]).astype("int64")
+                    label = label.reshape((-1, 1))
+                    loss_np, acc, weight = exe.run(
+                        fluid.default_main_program(),
+                        feed={"words": tensor_words,
+                              "label": label},
+                        fetch_list=[loss, batch_acc, batch_size_tensor])
+                    print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
+                          (pass_id, batch_id, loss_np, acc))
+
+                pass_end_time = time.time()
+                time_consumed = pass_end_time - pass_start_time
+                words_per_sec = word_nums / time_consumed
+                print("pass_id=%d, sec/pass: %f, words/s: %f" %
+                      (pass_id, time_consumed, words_per_sec))
+
+    train_loop(args.pass_num, args.crop_size)
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+if __name__ == '__main__':
+    main()
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
new file mode 100644
index 0000000000..3bf78e4cf0
--- /dev/null
+++ b/benchmark/fluid/vgg.py
@@ -0,0 +1,220 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument(
+    '--data_format',
+    type=str,
+    default='NCHW',
+    choices=['NCHW', 'NHWC'],
+    help='The data order, now only support NCHW.')
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+parser.add_argument(
+    '--with_test',
+    action='store_true',
+    help='If set, test the testset during training.')
+args = parser.parse_args()
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+
+
+def main():
+    if args.data_set == "cifar10":
+        classdim = 10
+        if args.data_format == 'NCHW':
+            data_shape = [3, 32, 32]
+        else:
+            data_shape = [32, 32, 3]
+    else:
+        classdim = 102
+        if args.data_format == 'NCHW':
+            data_shape = [3, 224, 224]
+        else:
+            data_shape = [224, 224, 3]
+
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    # Optimization
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    opts = optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    # Initialize executor
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    # Parameter initialization
+    exe.run(fluid.default_startup_program())
+
+    # data reader
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=args.batch_size)
+
+    # test
+    def test(exe):
+        test_accuracy = fluid.average.WeightedAverage()
+        for batch_id, data in enumerate(test_reader()):
+            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+                                    data)).astype("float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+
+            acc, weight = exe.run(inference_program,
+                                  feed={"pixel": img_data,
+                                        "label": y_data},
+                                  fetch_list=[batch_acc, batch_size_tensor])
+            test_accuracy.add(value=acc, weight=weight)
+        return test_accuracy.eval()
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    accuracy = fluid.average.WeightedAverage()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+                                    data)).astype("float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+
+            loss, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"pixel": img_data,
+                      "label": y_data},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+            accuracy.add(value=acc, weight=weight)
+            iters += 1
+            num_samples += len(data)
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iters, loss, acc)
+            )  # The accuracy is the accumulation of batches, but not the current batch.
+
+        pass_train_acc = accuracy.eval()
+        train_losses.append(loss)
+        train_accs.append(acc)
+        # evaluation
+        if args.with_test:
+            pass_test_acc = test(exe)
+        train_elapsed = time.time() - start_time
+        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+
+
+def print_arguments():
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == "__main__":
+    print_arguments()
+    main()

From d0ac92531dacfa09c8142ad515eb04ee9a8a8ef9 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Sun, 1 Apr 2018 02:35:07 -0700
Subject: [PATCH 49/57] Improve ParallelExecutor performance

---
 .../details/nccl_all_reduce_op_handle.cc      |  2 +-
 .../details/nccl_all_reduce_op_handle.h       |  5 ++
 .../fluid/framework/details/op_handle_base.h  |  4 ++
 .../details/threaded_ssa_graph_executor.cc    | 49 +++++++++++++++----
 .../details/threaded_ssa_graph_executor.h     | 12 ++++-
 paddle/fluid/framework/parallel_executor.cc   |  2 +
 python/paddle/fluid/parallel_executor.py      |  3 +-
 7 files changed, 63 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index 5ddf331cfc..55b5f11358 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -76,7 +76,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
   }
 }
 
-std::string NCCLAllReduceOpHandle::Name() const { return "NCCL AllReduce"; }
+std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
index 045070bb6a..3d61fa79f7 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -14,6 +14,9 @@
 
 #pragma once
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -34,6 +37,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
 
   std::string Name() const override;
 
+  bool IsDelayedOp() override { return true; };
+
  protected:
   void RunImpl() override;
 };
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 71672fd24c..54c2d627ff 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #pragma once
+#include <string>
+#include <vector>
 
 #include "paddle/fluid/framework/details/var_handle.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -53,6 +55,8 @@ class OpHandleBase {
 
   void AddOutput(VarHandleBase *out);
 
+  virtual bool IsDelayedOp() { return false; }
+
  protected:
   virtual void RunImpl() = 0;
 };
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 3f8655147b..075eed4ecc 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -29,17 +29,27 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
       local_scopes_(local_scopes),
       places_(places),
       fetch_ctxs_(places),
-      use_event_(use_event) {}
+      use_event_(use_event),
+      running_ops_(0) {}
+
+void ThreadedSSAGraphExecutor::RunDelayedOps(
+    const std::unordered_set<OpHandleBase *> &delayed_ops) {
+  for (auto op : delayed_ops) {
+    op->Run(use_event_);
+  }
+}
 
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
-
   BlockingQueue<VarHandleBase *> ready_vars;
-
   std::unordered_set<OpHandleBase *> ready_ops;
 
+  std::unordered_set<OpHandleBase *> delayed_ops;
+  std::unordered_set<OpHandleBase *> after_delayed_ops;
+  std::unordered_set<VarHandleBase *> delayed_vars;
+
   auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
     pending_vars.insert(&var);
     if (var.generated_op_ == nullptr) {
@@ -106,7 +116,14 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
   auto run_all_ready_ops = [&] {
     for (auto *op : ready_ops) {
-      RunOp(ready_vars, op);
+      if (op->IsDelayedOp()) {
+        delayed_ops.insert(op);
+        delayed_vars.insert(op->outputs_.begin(), op->outputs_.end());
+        ready_vars.Extend(op->outputs_);
+        continue;
+      }
+      running_ops_++;
+      RunOp(&ready_vars, op);
     }
     ready_ops.clear();
   };
@@ -124,7 +141,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
     // 2. Find ready variable
     bool timeout;
-    auto cur_ready_vars = ready_vars.PopAll(1000, &timeout);
+    auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
 
     if (timeout) {
       if (exception_) {
@@ -141,13 +158,24 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         auto &deps = pending_ops[op];
         --deps;
         if (deps == 0) {
-          ready_ops.insert(op);
+          if (delayed_vars.find(ready_var) != delayed_vars.end()) {
+            after_delayed_ops.insert(op);
+          } else {
+            ready_ops.insert(op);
+          }
         }
       }
     }
+    if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) {
+      RunDelayedOps(delayed_ops);
+      delayed_ops.clear();
+      for (auto *op : after_delayed_ops) {
+        ready_ops.insert(op);
+      }
+      after_delayed_ops.clear();
+    }
     // Keep loop until all vars are ready.
   }
-
   ++computation_count_;
 
   auto sync_computation = [&] {
@@ -182,12 +210,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 }
 
 void ThreadedSSAGraphExecutor::RunOp(
-    BlockingQueue<VarHandleBase *> &ready_var_q, details::OpHandleBase *op) {
-  auto op_run = [&ready_var_q, op, this] {
+    BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
+  auto op_run = [ready_var_q, op, this] {
     try {
       VLOG(10) << op->Name() << " : " << op->DebugString();
       op->Run(use_event_);
-      ready_var_q.Extend(op->outputs_);
+      running_ops_--;
+      ready_var_q->Extend(op->outputs_);
     } catch (platform::EnforceNotMet ex) {
       exception_.reset(new platform::EnforceNotMet(ex));
     } catch (...) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 2ea57ac8f9..6193b897e4 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -14,7 +14,12 @@
 
 #pragma once
 
-#include <chrono>
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
 #include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
@@ -79,9 +84,11 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ~ThreadedSSAGraphExecutor() {}
 
  private:
-  void RunOp(BlockingQueue<VarHandleBase *> &ready_var_q,
+  void RunOp(BlockingQueue<VarHandleBase *> *ready_var_q,
              details::OpHandleBase *op);
 
+  void RunDelayedOps(const std::unordered_set<OpHandleBase *> &delayed_ops);
+
  private:
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
@@ -89,6 +96,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   platform::DeviceContextPool fetch_ctxs_;
   const bool use_event_;
   std::unique_ptr<platform::EnforceNotMet> exception_;
+  std::atomic<int> running_ops_;
 
   size_t computation_count_{0};
   size_t max_async_computation{100};
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 577eea92d2..002a6d362f 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/parallel_executor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 #include <string>
 #include <vector>
@@ -151,6 +152,7 @@ void ParallelExecutor::BCastParamsToGPUs(
 
 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                            const std::string &fetched_var_name) {
+  platform::RecordBlock b(0);
   auto fetch_data = member_->executor_->Run(fetch_tensors);
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
       fetch_data;
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 5e0588fa73..33e8d3bf21 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -16,6 +16,7 @@ import core
 import multiprocessing
 import framework
 import executor
+import sys
 
 __all__ = ['ParallelExecutor']
 
@@ -35,7 +36,7 @@ class ParallelExecutor(object):
                 places.append(p)
 
         if num_threads is None:
-            num_threads = min(len(places) * 2, multiprocessing.cpu_count())
+            num_threads = len(places)
 
         startup = framework.default_startup_program()
         main = framework.default_main_program()

From 46f3a39e91fd422f1b6d5cbaadad9a35456eb36a Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Sun, 1 Apr 2018 18:05:59 -0700
Subject: [PATCH 50/57] polish and add comments.

---
 .../fluid/framework/details/nccl_all_reduce_op_handle.h   | 2 ++
 .../framework/details/threaded_ssa_graph_executor.cc      | 5 ++++-
 python/paddle/fluid/parallel_executor.py                  | 8 ++++++--
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
index 3d61fa79f7..bb92625667 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -37,6 +37,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
 
   std::string Name() const override;
 
+  // Delay and buffer nccl_all_reduce together can significantly increase
+  // performance. Disable this feature by returning false.
   bool IsDelayedOp() override { return true; };
 
  protected:
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 075eed4ecc..32fc9100ab 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -45,7 +45,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   std::unordered_set<VarHandleBase *> pending_vars;
   BlockingQueue<VarHandleBase *> ready_vars;
   std::unordered_set<OpHandleBase *> ready_ops;
-
+  // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
+  // streams from multiple GPUs, it's faster to buffer them and schedule
+  // together since we currently cannot overlap computation and memcpy streams.
+  // Should revisit it if overlapping is available.
   std::unordered_set<OpHandleBase *> delayed_ops;
   std::unordered_set<OpHandleBase *> after_delayed_ops;
   std::unordered_set<VarHandleBase *> delayed_vars;
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 33e8d3bf21..fec7d6899c 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -16,7 +16,6 @@ import core
 import multiprocessing
 import framework
 import executor
-import sys
 
 __all__ = ['ParallelExecutor']
 
@@ -36,7 +35,12 @@ class ParallelExecutor(object):
                 places.append(p)
 
         if num_threads is None:
-            num_threads = len(places)
+            if use_cuda:
+                # Experiments on se-resnext shows that too many threads hurt
+                # performance. Worth tunning for other models in the future.
+                num_threads = len(places)
+            else:
+                min(len(places) * 2, multiprocessing.cpu_count())
 
         startup = framework.default_startup_program()
         main = framework.default_main_program()

From be1373dcf9c233b6a0c870232adb0e66df64f80c Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Sun, 1 Apr 2018 18:11:52 -0700
Subject: [PATCH 51/57] Polish

---
 .../framework/details/nccl_all_reduce_op_handle.h    |  2 +-
 paddle/fluid/framework/details/op_handle_base.h      |  4 +++-
 .../framework/details/threaded_ssa_graph_executor.cc | 12 +++++++-----
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
index bb92625667..ad14a3c5cb 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -39,7 +39,7 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
 
   // Delay and buffer nccl_all_reduce together can significantly increase
   // performance. Disable this feature by returning false.
-  bool IsDelayedOp() override { return true; };
+  bool IsMultiDeviceTransfer() override { return true; };
 
  protected:
   void RunImpl() override;
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 54c2d627ff..d7a541ac4b 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -55,7 +55,9 @@ class OpHandleBase {
 
   void AddOutput(VarHandleBase *out);
 
-  virtual bool IsDelayedOp() { return false; }
+  // If the Op involves data transfer of multiple devices that
+  // will likely block other computations.
+  virtual bool IsMultiDeviceTransfer() { return false; }
 
  protected:
   virtual void RunImpl() = 0;
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 32fc9100ab..65fbfb65e1 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -50,7 +50,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   // together since we currently cannot overlap computation and memcpy streams.
   // Should revisit it if overlapping is available.
   std::unordered_set<OpHandleBase *> delayed_ops;
-  std::unordered_set<OpHandleBase *> after_delayed_ops;
+  std::unordered_set<OpHandleBase *> blocked_by_delayed_ops;
   std::unordered_set<VarHandleBase *> delayed_vars;
 
   auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
@@ -119,7 +119,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
   auto run_all_ready_ops = [&] {
     for (auto *op : ready_ops) {
-      if (op->IsDelayedOp()) {
+      if (op->IsMultiDeviceTransfer()) {
         delayed_ops.insert(op);
         delayed_vars.insert(op->outputs_.begin(), op->outputs_.end());
         ready_vars.Extend(op->outputs_);
@@ -162,20 +162,22 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         --deps;
         if (deps == 0) {
           if (delayed_vars.find(ready_var) != delayed_vars.end()) {
-            after_delayed_ops.insert(op);
+            blocked_by_delayed_ops.insert(op);
           } else {
             ready_ops.insert(op);
           }
         }
       }
     }
+    // When there are no other ops to schedule, schedule buffered delayed
+    // ops and unblock other ops.
     if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) {
       RunDelayedOps(delayed_ops);
       delayed_ops.clear();
-      for (auto *op : after_delayed_ops) {
+      for (auto *op : blocked_by_delayed_ops) {
         ready_ops.insert(op);
       }
-      after_delayed_ops.clear();
+      blocked_by_delayed_ops.clear();
     }
     // Keep loop until all vars are ready.
   }

From b123ce88a17ac18dd24ec396d18c1eac7c832442 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Mon, 2 Apr 2018 01:10:00 -0700
Subject: [PATCH 52/57] Add enable/disable for delayed ops

---
 .../details/threaded_ssa_graph_executor.cc       | 12 ++++++++----
 .../details/threaded_ssa_graph_executor.h        |  4 +++-
 paddle/fluid/framework/parallel_executor.cc      |  6 +++---
 paddle/fluid/framework/parallel_executor.h       |  6 ++++--
 paddle/fluid/pybind/pybind.cc                    |  4 ++--
 python/paddle/fluid/parallel_executor.py         |  9 +++++++--
 .../tests/unittests/test_parallel_executor.py    | 16 ++++++++++++++--
 7 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 65fbfb65e1..1f96b9dc62 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -23,14 +23,15 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     size_t num_threads, bool use_event,
     const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
-    std::unique_ptr<SSAGraph> &&graph)
+    std::unique_ptr<SSAGraph> &&graph, bool allow_op_delay)
     : SSAGraphExecutor(std::move(graph)),
       pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr),
       local_scopes_(local_scopes),
       places_(places),
       fetch_ctxs_(places),
       use_event_(use_event),
-      running_ops_(0) {}
+      running_ops_(0),
+      allow_op_delay_(allow_op_delay) {}
 
 void ThreadedSSAGraphExecutor::RunDelayedOps(
     const std::unordered_set<OpHandleBase *> &delayed_ops) {
@@ -119,7 +120,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
   auto run_all_ready_ops = [&] {
     for (auto *op : ready_ops) {
-      if (op->IsMultiDeviceTransfer()) {
+      if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
         delayed_ops.insert(op);
         delayed_vars.insert(op->outputs_.begin(), op->outputs_.end());
         ready_vars.Extend(op->outputs_);
@@ -138,7 +139,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   }
 
   // Step 3. Execution
-  while (!pending_vars.empty()) {
+  while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
     // 1. Run All Ready ops
     run_all_ready_ops();
 
@@ -181,6 +182,9 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     }
     // Keep loop until all vars are ready.
   }
+  PADDLE_ENFORCE(ready_ops.empty());
+  PADDLE_ENFORCE(delayed_ops.empty());
+  PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
   ++computation_count_;
 
   auto sync_computation = [&] {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 6193b897e4..79cfc26b46 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -75,7 +75,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ThreadedSSAGraphExecutor(size_t num_threads, bool use_event,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::unique_ptr<SSAGraph> &&graph);
+                           std::unique_ptr<SSAGraph> &&graph,
+                           bool allow_op_delay);
 
   // Run a SSAGraph by a thread pool
   // Use topological sort algorithm
@@ -97,6 +98,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   const bool use_event_;
   std::unique_ptr<platform::EnforceNotMet> exception_;
   std::atomic<int> running_ops_;
+  bool allow_op_delay_;
 
   size_t computation_count_{0};
   size_t max_async_computation{100};
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 002a6d362f..1788514324 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -48,7 +48,7 @@ ParallelExecutor::ParallelExecutor(
     const std::vector<platform::Place> &places,
     const std::unordered_set<std::string> &params,
     const ProgramDesc &startup_program, const ProgramDesc &main_program,
-    const std::string &loss_var_name, Scope *scope)
+    const std::string &loss_var_name, Scope *scope, bool allow_op_delay)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
 
@@ -83,8 +83,8 @@ ParallelExecutor::ParallelExecutor(
   auto graph = builder.Build(main_program);
 
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      num_threads, use_event, member_->local_scopes_, places,
-      std::move(graph)));
+      num_threads, use_event, member_->local_scopes_, places, std::move(graph),
+      allow_op_delay));
 
   // Step 3. Create vars in each scope;
   for (auto *scope : member_->local_scopes_) {
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 503efa2e44..964b476234 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,8 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include <future>
+#include <string>
 #include <unordered_set>
+#include <vector>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -37,7 +38,8 @@ class ParallelExecutor {
                             const std::unordered_set<std::string>& params,
                             const ProgramDesc& startup_program,
                             const ProgramDesc& main_program,
-                            const std::string& loss_var_name, Scope* scope);
+                            const std::string& loss_var_name, Scope* scope,
+                            bool allow_op_delay);
 
   void Run(const std::vector<std::string>& fetch_tensors,
            const std::string& fetched_var_name = "fetched_var");
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index e1b1bbec97..b0a3f06a88 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -504,10 +504,10 @@ All parameter, weight, gradient are variables in Paddle.
               const std::unordered_set<std::string> &params,
               const ProgramDesc &startup_program,
               const ProgramDesc &main_program, const std::string &loss_var_name,
-              Scope *scope) {
+              Scope *scope, bool allow_op_delay) {
              new (&self) ParallelExecutor(num_threads, use_event, places,
                                           params, startup_program, main_program,
-                                          loss_var_name, scope);
+                                          loss_var_name, scope, allow_op_delay);
            })
       .def("run", &ParallelExecutor::Run);
 
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index fec7d6899c..a2c830b3c9 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -21,7 +21,11 @@ __all__ = ['ParallelExecutor']
 
 
 class ParallelExecutor(object):
-    def __init__(self, loss_name, use_cuda, num_threads=None):
+    def __init__(self,
+                 loss_name,
+                 use_cuda,
+                 num_threads=None,
+                 allow_op_delay=False):
         places = []
         if use_cuda:
             for i in xrange(core.get_cuda_device_count()):
@@ -57,7 +61,8 @@ class ParallelExecutor(object):
             startup.desc,
             main.desc,
             loss_name,
-            scope)
+            scope,
+            allow_op_delay)
         self.scope = scope
 
     def run(self, fetch_list):
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index 95d0f9da47..60130298af 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -184,7 +184,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   method,
                                   memory_opt=True,
                                   iter=10,
-                                  batch_size=None):
+                                  batch_size=None,
+                                  allow_op_delay=False):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
@@ -194,7 +195,10 @@ class TestParallelExecutorBase(unittest.TestCase):
             if memory_opt:
                 fluid.memory_optimize(main)
 
-            exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
+            exe = fluid.ParallelExecutor(
+                loss_name=loss.name,
+                use_cuda=True,
+                allow_op_delay=allow_op_delay)
             if batch_size is not None:
                 batch_size *= fluid.core.get_cuda_device_count()
             begin = time.time()
@@ -236,9 +240,11 @@ class TestMNIST(TestParallelExecutorBase):
 
     def test_simple_fc(self):
         self.check_network_convergence(simple_fc_net)
+        self.check_network_convergence(simple_fc_net, allow_op_delay=True)
 
     def test_batchnorm_fc(self):
         self.check_network_convergence(fc_with_batchnorm)
+        self.check_network_convergence(fc_with_batchnorm, allow_op_delay=True)
 
 
 class TestResnet(TestParallelExecutorBase):
@@ -268,6 +274,12 @@ class TestResnet(TestParallelExecutorBase):
                 SE_ResNeXt152, batch_size=batch_size),
             iter=20,
             batch_size=batch_size)
+        self.check_network_convergence(
+            functools.partial(
+                SE_ResNeXt152, batch_size=batch_size),
+            iter=20,
+            batch_size=batch_size,
+            allow_op_delay=True)
 
 
 class ModelHyperParams(object):

From b25a9c488d3d6945157adeeac2c504ca3be977da Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 3 Apr 2018 00:16:39 -0700
Subject: [PATCH 53/57] Reduce test size to avoid GPU memory issue.

---
 .../paddle/fluid/tests/unittests/CMakeLists.txt  |  2 ++
 .../tests/unittests/test_parallel_executor.py    | 16 ++++++++--------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 0ad273c716..1b2d29a47f 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -29,6 +29,7 @@ function(py_test_modules TARGET_NAME)
 endfunction()
 
 # test time consuming OPs in a separate process for expliot parallism
+list(REMOVE_ITEM TEST_OPS test_parallel_executor)
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dyn_rnn)
 list(REMOVE_ITEM TEST_OPS test_mul_op)
@@ -64,6 +65,7 @@ else()
 endif(WITH_FAST_BUNDLE_TEST)
 
 # tests with high overhead
+py_test_modules(test_parallel_executor MODULES test_parallel_executor)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
 py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn)
 py_test_modules(test_mul_op MODULES test_mul_op)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index 60130298af..f132a754a2 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -135,18 +135,18 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
     return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
 
 
-def SE_ResNeXt152(batch_size=4):
+def SE_ResNeXt152Small(batch_size=2):
     img = fluid.layers.fill_constant(
         shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
     label = fluid.layers.fill_constant(
         shape=[batch_size, 1], dtype='int64', value=0.0)
 
     conv = conv_bn_layer(
-        input=img, num_filters=64, filter_size=3, stride=2, act='relu')
+        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
     conv = conv_bn_layer(
-        input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
     conv = conv_bn_layer(
-        input=conv, num_filters=128, filter_size=3, stride=1, act='relu')
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
     conv = fluid.layers.pool2d(
         input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
 
@@ -226,7 +226,7 @@ class TestMNIST(TestParallelExecutorBase):
     def setUpClass(cls):
         # Convert mnist to recordio file
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            reader = paddle.batch(mnist.train(), batch_size=32)
+            reader = paddle.batch(mnist.train(), batch_size=4)
             feeder = fluid.DataFeeder(
                 feed_list=[  # order is image and label
                     fluid.layers.data(
@@ -268,15 +268,15 @@ class TestResnet(TestParallelExecutorBase):
 
     def test_resnet(self):
         import functools
-        batch_size = 4
+        batch_size = 2
         self.check_network_convergence(
             functools.partial(
-                SE_ResNeXt152, batch_size=batch_size),
+                SE_ResNeXt152Small, batch_size=batch_size),
             iter=20,
             batch_size=batch_size)
         self.check_network_convergence(
             functools.partial(
-                SE_ResNeXt152, batch_size=batch_size),
+                SE_ResNeXt152Small, batch_size=batch_size),
             iter=20,
             batch_size=batch_size,
             allow_op_delay=True)

From cf251eb8cf3051f21306d73c73a48b0f2443ef8d Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 3 Apr 2018 01:07:19 -0700
Subject: [PATCH 54/57] shrink test size

---
 .../paddle/fluid/tests/unittests/test_parallel_executor.py  | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index f132a754a2..a79e4b3e18 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -274,12 +274,6 @@ class TestResnet(TestParallelExecutorBase):
                 SE_ResNeXt152Small, batch_size=batch_size),
             iter=20,
             batch_size=batch_size)
-        self.check_network_convergence(
-            functools.partial(
-                SE_ResNeXt152Small, batch_size=batch_size),
-            iter=20,
-            batch_size=batch_size,
-            allow_op_delay=True)
 
 
 class ModelHyperParams(object):

From 3f3ecae164c13fafc8f5066c53d00eda2a925d45 Mon Sep 17 00:00:00 2001
From: weixing02 <564445201@qq.com>
Date: Tue, 3 Apr 2018 17:27:05 +0800
Subject: [PATCH 55/57] Fix tables display error

---
 benchmark/cluster/README.md                   | 148 ++++++++++++++--
 benchmark/cluster/vgg16/README.md             | 160 +++++++++++++++---
 .../design/algorithm/parameter_average.md     |   2 +-
 doc/fluid/design/concepts/README.md           |  32 +++-
 doc/fluid/design/concepts/block.md            |  61 +++++--
 .../concepts/functions_operators_layers.md    |  40 ++++-
 doc/fluid/design/concepts/lod_tensor.md       |  38 ++++-
 doc/fluid/design/concepts/var_desc.md         |  25 ++-
 .../concurrent/concurrent_programming.md      |  46 +++--
 doc/fluid/design/concurrent/csp.md            |  47 +++--
 doc/fluid/design/modules/python_api.md        |  33 +++-
 doc/fluid/design/motivation/fluid.md          |  36 +++-
 .../design/motivation/refactorization.md      |  36 +++-
 doc/fluid/design/network/deep_speech_2.md     | 123 +++++++++++---
 doc/fluid/dev/new_op_cn.md                    |  33 +++-
 doc/fluid/dev/new_op_en.md                    |  29 +++-
 doc/fluid/dev/releasing_process.md            | 126 +++++++++++++-
 .../concepts/save_model/model_format.md       |  68 ++++++--
 .../howto/cluster/fluid_cluster_train_cn.md   |  58 +++++--
 .../howto/optimization/cpu_profiling_cn.md    |  42 ++++-
 .../howto/optimization/cpu_profiling_en.md    |  35 ++++
 21 files changed, 1043 insertions(+), 175 deletions(-)

diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
index b619613ea7..64816098a5 100644
--- a/benchmark/cluster/README.md
+++ b/benchmark/cluster/README.md
@@ -36,11 +36,41 @@
 - Trainer Count: 100
 - Metrics: mini-batch / sec
 
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+
+<table>
+<thead>
+<tr>
+<th>Batch Size </th>
+<th> 32</th>
+<th>64</th>
+<th>128 </th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td>-</td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+</tbody>
+</table>
 
 ### Measure the Performance for Different PServer Count
 
@@ -48,11 +78,41 @@
 - Batch Size: 64
 - Metrics: mini-batch / sec
 
-| PServer Count | 10 | 20 | 40 | 60 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+
+<table>
+<thead>
+<tr>
+<th>PServer Count  </th>
+<th>10</th>
+<th>20</th>
+<th>40 </th>
+<th>60</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td>-</td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+</tbody>
+</table>
 
 ### Measure Parallel Efficiency By Increasing Trainer Count
 
@@ -67,11 +127,69 @@ The parallel efficiency is:
 
 $E = \div(S, N)$
 
-| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>Trainer Counter  </th>
+<th>1</th>
+<th>10</th>
+<th>20 </th>
+<th>30</th>
+<th>40</th>
+<th>50</th>
+<th>60 </th>
+<th>70</th>
+<th>80</th>
+<th>90</th>
+<th>100 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+</tr>
+</tbody>
+</table>
+
 
 ## Reproduce the benchmark
 
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index cd681a1a28..d56a912b9b 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -16,11 +16,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 
 - Metrics: samples / sec
 
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
+<table>
+<thead>
+<tr>
+<th>Batch Size </th>
+<th> 32</th>
+<th>64</th>
+<th>128 </th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td> 15.44 </td>
+<td> 16.32 </td>
+<td> 16.74 </td>
+<td> 16.79 </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td> 15.97 </td>
+<td> 17.04 </td>
+<td> 17.60 </td>
+<td> 17.83 </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> 9.09 </td>
+<td> 9.10 </td>
+<td> 9.24 </td>
+<td> 8.66 </td>
+</tr>
+</tbody>
+</table>
+
 
 ### Different Batch Size
 
@@ -28,12 +58,40 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Trainer Count: 20
 - Metrics: samples / sec
 
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
+<table>
+<thead>
+<tr>
+<th>Batch Size </th>
+<th> 32</th>
+<th>64</th>
+<th>128 </th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td> 190.20 </td>
+<td> 222.15 </td>
+<td> 247.40 </td>
+<td> 258.18 </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td> 170.96 </td>
+<td> 233.71 </td>
+<td> 256.14 </td>
+<td> 329.23 </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+</tr>
+</tbody>
+</table>
 
 ### Accelerate Rate
 
@@ -41,11 +99,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples / sec
 
-| Trainer Count | 20 | 40 | 80 | 100 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>Trainer Count </th>
+<th>20</th>
+<th>40</th>
+<th>80</th>
+<th>100</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td> 263.29 (78.64%) </td>
+<td> 518.80 (77.47%) </td>
+<td> 836.26 (62.44%) </td>
+<td> 1019.29 (60.89%) </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 (need more tests)   </td>
+<td> 326.85 (92.85%) </td>
+<td> 534.58 (75.93%) </td>
+<td> 853.30 (60.60%) </td>
+<td> 1041.99 (59.20%) </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+</tr>
+</tbody>
+</table>
+
 
 ### Different Pserver Count
 
@@ -53,11 +141,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples/ sec
 
-| PServer Count | 3 | 6 |10 | 20 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>PServer Count </th>
+<th>3</th>
+<th>6</th>
+<th>10</th>
+<th>20</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid(should fix in next PR) </td>
+<td> 589.1 </td>
+<td> 592.6 </td>
+<td> 656.4 </td>
+<td> 655.8 </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 (need more tests)   </td>
+<td> 593.4 </td>
+<td> 791.3 </td>
+<td> 729.7 </td>
+<td> 821.7 </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+</tr>
+</tbody>
+</table>
+
 
 *The performance gap between Fuild and v2 comes from the network interference.*
 
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
index 2c4edee9fe..53d601d3a9 100644
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -7,7 +7,7 @@ Polyak and Juditsky (1992) showed that the test performance of simple average of
 
 Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="./images/theta_star.gif"/><br/> . The averaging is done as follows:
 
-<img src="./images/asgd.gif" align="center"/><br/>
+![](./images/asgd.gif)
 
 We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
 
diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md
index ed3f5aab28..8ded0ad22f 100644
--- a/doc/fluid/design/concepts/README.md
+++ b/doc/fluid/design/concepts/README.md
@@ -6,11 +6,33 @@ Here are some initial thoughts. Your comments are welcome!
 
 I think we need only the following few CMake functions to make a project description mean and clean:
 
-| C++ | CUDA C++ | Go |
-|---|---|---|
-| cc_library | nv_library | go_library |
-| cc_binary | nv_binary | go_binary |
-| cc_test | nv_test | go_test |
+<table>
+<thead>
+<tr>
+<th>C++</th>
+<th>CUDA C++</th>
+<th>Go</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>cc_library </td>
+<td>nv_library </td>
+<td>go_library </td>
+</tr>
+<tr>
+<td>cc_binary </td>
+<td>nv_binary </td>
+<td>go_binary </td>
+</tr>
+<tr>
+<td> cc_test </td>
+<td> nv_test </td>
+<td> go_test </td>
+</tr>
+</tbody>
+</table>
+
 
 - The `_library` functions generate  .a files from source code.
 - The `_binary` functions generate executable binary files.
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md
index 907a2def55..3b626bd89c 100644
--- a/doc/fluid/design/concepts/block.md
+++ b/doc/fluid/design/concepts/block.md
@@ -14,11 +14,29 @@ In programming languages, a block is a pair of curly braces that includes local
 
 Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
 
-| programming languages | PaddlePaddle          |
-|-----------------------|-----------------------|
-| for, while loop       | RNN, WhileOp          |
-| if, if-else, switch   | IfElseOp, SwitchOp    |
-| sequential execution  | a sequence of layers  |
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>for, while loop </td>
+<td>RNN, WhileOp </td>
+</tr>
+<tr>
+<td>if, if-else, switch </td>
+<td>IfElseOp, SwitchOp </td>
+</tr>
+<tr>
+<td>sequential execution </td>
+<td>a sequence of layers </td>
+</tr>
+</tbody>
+</table>
+
 
 A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
 
@@ -26,12 +44,33 @@ A key difference is that a C++ program describes a one pass computation, whereas
 
 The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
 
-| programming languages | PaddlePaddle                    |
-|-----------------------|---------------------------------|
-| stack                 | scope hierarchy                 |
-| stack frame           | scope                           |
-| push at entering block| push at entering block          |
-| pop at leaving block  | destroy when minibatch completes|
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>stack </td>
+<td>scope hierarchy </td>
+</tr>
+<tr>
+<td>stack frame  </td>
+<td>scope </td>
+</tr>
+<tr>
+<td>push at entering block </td>
+<td>push at entering block </td>
+</tr>
+<tr>
+<td>pop at leaving block </td>
+<td>destroy when minibatch completes </td>
+</tr>
+</tbody>
+</table>
+
 
 1. In traditional programs:
 
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
index 984b59f4c6..30bc488a18 100644
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -86,12 +86,40 @@ def layer.fc(X):
 
 We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in above illustrative example:
 
-
-| C++ functions/functors | mul          | add          |             |          |
-|------------------------|--------------|--------------|-------------|----------|
-| C++ operator class     | mulOp        | addOp        | FCOp        |          |
-| Python binding         | operator.mul | operator.add | operator.fc |          |
-| Python function        |              |              |             | layer.fc |
+<table>
+<thead>
+<tr>
+<th>C++ functions/functors</th>
+<th>mul</th>
+<th>add</th>
+<th></th>
+<th></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>C++ operator class </td>
+<td>mulOp</td>
+<td>addOp </td>
+<td>FCOp </td>
+<td></td>
+</tr>
+<tr>
+<td>Python binding  </td>
+<td>operator.mul</td>
+<td> operator.add </td>
+<td>operator.fc </td>
+<td></td>
+</tr>
+<tr>
+<td>Python function   </td>
+<td></td>
+<td></td>
+<td> </td>
+<td>layer.fc</td>
+</tr>
+</tbody>
+</table>
 
 
 This is how we differentiate layer and operators in PaddlePaddle:
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
index 10a8a7867f..a88292e788 100644
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -2,12 +2,38 @@
 
 Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros.
 
-|                       | TensorFlow | PaddlePaddle |
-|-----------------------|------------|--------------|
-| RNN                   | Support    | Support      |
-| recursive RNN         | Support    | Support      |
-| padding zeros         | Must       | No need      |
-| blob data type        | Tensor     | LoDTensor    |
+<table>
+<thead>
+<tr>
+<th></th>
+<th>TensorFlow</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>RNN </td>
+<td>Support </td>
+<td>Support </td>
+</tr>
+<tr>
+<td>recursive RNN </td>
+<td>Support </td>
+<td>Support </td>
+</tr>
+<tr>
+<td>padding zeros </td>
+<td> Must </td>
+<td>No need </td>
+</tr>
+<tr>
+<td> blob data type </td>
+<td> Tensor</td>
+<td> LoDTensor </td>
+</tr>
+</tbody>
+</table>
+
 
 PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators.  The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.
 
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
index fcba08c07f..6750323c01 100644
--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -10,10 +10,27 @@ PaddlePaddle uses proto message to describe compile time program because :
 
 The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`)  and  `Operations`. The concept to represent them is in the table below.
 
-| |compile time|runtime|
-|---|---|---|
-|Data|VarDesc(proto)|Variable(cpp)|
-|Operation|OpDesc(proto)|Operator(cpp)|
+<table>
+<thead>
+<tr>
+<th></th>
+<th>compile time</th>
+<th>runtime</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data </td>
+<td>VarDesc(proto) </td>
+<td>Variable(cpp) </td>
+</tr>
+<tr>
+<td>Operation </td>
+<td>OpDesc(proto) </td>
+<td>Operator(cpp) </td>
+</tr>
+</tbody>
+</table>
 
 
 ## Definition of VarType
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
index f022e67fd3..6460216606 100644
--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -10,12 +10,38 @@ The answer relies on the fact that a `ProgramDesc` is similar to an abstract syn
 
 The following table compares concepts in Fluid and Go
 
-| Go | Fluid |
-|----|-------|
-|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
-| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
-| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
-| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Go</th>
+<th>Fluid</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>user-defined functions </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid">layers</a></td>
+</tr>
+<tr>
+<td>control-flow and built-in functions </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators">intrinsics/operators</a></td>
+</tr>
+<tr>
+<td>goroutines, channels </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h">class ThreadPool</a></td>
+</tr>
+<tr>
+<td>runtime </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h">class Executor</a></td>
+</tr>
+</tbody>
+</table>
+
 
 ## An Example Concurrent Program
 
@@ -77,11 +103,11 @@ message ProgramDesc {
       read(output = X)
       kube_get_workers_addrs(output = L)
       Y = tensor_array(len(L))
-      parallel_for(input = X, output = Y, 
+      parallel_for(input = X, output = Y,
                    attrs = {L, block_id(1)}) # referring to block 1
     ]
   }
-  
+
   block[1] = Block {
     parent = 0,
     vars = [x, y, index],
@@ -102,7 +128,7 @@ func main() {  //// block 0
   X = fluid.read(...)
   L = fluid.k8s.get_worker_addrs()
   Y = fluid.tensor_array(len(L))
-  fluid.parallel_for(X, L, 
+  fluid.parallel_for(X, L,
                      func(index int) {  //// block 1
                        x = X[index]
                        fluid.send(L[index], x)
@@ -116,7 +142,7 @@ An explanation of the above program:
 
 - `fluid.k8s` is a package that provides access to Kubernetes API.  
 - `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).  
-- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, 
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
 
   1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
   2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread  
diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md
index 10d936860f..66d19f44ba 100644
--- a/doc/fluid/design/concurrent/csp.md
+++ b/doc/fluid/design/concurrent/csp.md
@@ -13,14 +13,41 @@ Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously exe
 
 There were many concurrent programming models, implemented in various forms:
 
-| concurrent programming model | implementation |
-|-----|-----|
-| mutex | types and functions in standard libraries |
-| semaphore | types and functions in standard libraries |
-| communicating sequential processes (CSP) | Go programming language |
-| actor model | Erlang programming language |
-| message passing | MPI |
-| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+<table>
+<thead>
+<tr>
+<th>concurrent programming model</th>
+<th>implementation</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>mutex </td>
+<td>types and functions in standard libraries </td>
+</tr>
+<tr>
+<td>semaphore </td>
+<td> types and functions in standard libraries </td>
+</tr>
+<tr>
+<td> communicating sequential processes (CSP)  </td>
+<td> Go programming language </td>
+</tr>
+<tr>
+<td> actor model  </td>
+<td> Erlang programming language </td>
+</tr>
+<tr>
+<td> message passing  </td>
+<td> MPI </td>
+</tr>
+<tr>
+<td> bulk synchronous parallel (BSP)   </td>
+<td> Pregel distributed programming framework </td>
+</tr>
+</tbody>
+</table>
+
 
 Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
 
@@ -118,9 +145,9 @@ There are four types of actions with a channel:
    ```go
    close(ch)
    ```
-   
+
    Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
-   
+
 There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
 
 1. A send to a nil channel blocks forever
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
index 73f6d7b90c..f83ad3b6a4 100644
--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@@ -2,12 +2,33 @@
 
 Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
 
-| Python classes | Protobuf messages |
-| --- | --- |
-| Program | ProgramDesc |
-| Block | BlockDesc |
-| Operator | OpDesc |
-| Variable | VarDesc |
+<table>
+<thead>
+<tr>
+<th>Python classes</th>
+<th>Protobuf messages</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Program </td>
+<td>ProgramDesc </td>
+</tr>
+<tr>
+<td>Block  </td>
+<td>BlockDesc </td>
+</tr>
+<tr>
+<td>Operator </td>
+<td>OpDesc </td>
+</tr>
+<tr>
+<td>Variable </td>
+<td>VarDesc </td>
+</tr>
+</tbody>
+</table>
+
 
 Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
 
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
index 110b7d78bf..5e147f8263 100644
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
@@ -10,11 +10,37 @@ Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution
 
 Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
 
-| Existed since | model as sequence of layers | model as graph of operators | No model |
-|--|--|--|--|
-| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
-| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
-| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+<table>
+<thead>
+<tr>
+<th>Existed since</th>
+<th>model as sequence of layers</th>
+<th>model as graph of operators</th>
+<th>No model</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>2013 </td>
+<td>Caffe, Theano, Torch, PaddlePaddle </td>
+<td> </td>
+<td> </td>
+</tr>
+<tr>
+<td>2015 </td>
+<td> </td>
+<td>TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
+<td> </td>
+</tr>
+<tr>
+<td>2016 </td>
+<td> </td>
+<td>   </td>
+<td> PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid</td>
+</tr>
+</tbody>
+</table>
+
 
 From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
 
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
index 7c39fabcc6..f199cc892f 100644
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -36,11 +36,37 @@ At compile time, the Python program generates a protobuf message representation
 
 At runtime, the C++ program realizes the graph and runs it.
 
-| | Representation (protobuf messages) | Realization (C++ class objects) |
-|---|---|---|
-|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
-|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
-|Block|BlockDesc|Block|
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Representation (protobuf messages)</th>
+<th>Realization (C++ class objects) </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107">VarDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24">Variable</a></td>
+</tr>
+<tr>
+<td>Operation </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35">OpDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64">Operator</a></td>
+</tr>
+<tr>
+<td>Block </td>
+<td>BlockDesc </td>
+<td>Block </td>
+
+</tbody>
+</table>
+
 
 The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).
 
diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md
index af0c6ef36f..7f5dcf55f9 100644
--- a/doc/fluid/design/network/deep_speech_2.md
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -1,4 +1,4 @@
-# DeepSpeech2 on PaddlePaddle: Design Doc 
+# DeepSpeech2 on PaddlePaddle: Design Doc
 
 We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
 
@@ -68,11 +68,33 @@ We roughly break down the project into 14 tasks:
 
 Tasks parallelizable within phases:
 
-Roadmap     | Description                               | Parallelizable Tasks 
------------ | :------------------------------------     | :--------------------
-Phase I	    | Simplified model & components             | *Task 1* ~ *Task 8*
-Phase II    | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
-Phase III   | Documentations                            | *Task13* ~ *Task14*
+<table>
+<thead>
+<tr>
+<th>Roadmap</th>
+<th>Description</th>
+<th> Parallelizable Tasks</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Phase I </td>
+<td>Simplified model & components </td>
+<td>Task 1 ~ Task 8</td>
+</tr>
+<tr>
+<td>Phase II </td>
+<td> Standard model & benchmarking & profiling</td>
+<td>Task 9 ~ Task 12 </td>
+</tr>
+<tr>
+<td>Phase III </td>
+<td> Documentations</td>
+<td> Task13 ~ Task14 </td>
+</tr>
+</tbody>
+</table>
+
 
 Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
 
@@ -102,37 +124,82 @@ We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar
 
 Key ingredients about the layers:
 
-- **Data Layers**: 
+- **Data Layers**:
    - Frame sequences data of audio **spectrogram** (with FFT).
-   - Token sequences data of **transcription** text (labels). 
+   - Token sequences data of **transcription** text (labels).
    - These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required.
-- **2D Convolution Layers**: 
+- **2D Convolution Layers**:
    - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
    - With striding for only the first convlution layer.
    - No pooling for all convolution layers.
-- **Uni-directional RNNs** 
+- **Uni-directional RNNs**
 	- Uni-directional + row convolution: for low-latency inference.
 	- Bi-direcitional + without row convolution: if we don't care about the inference latency.
 - **Row convolution**:
 	- For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
-	- Not nessesary if with bi-direcitional RNNs. 
+	- Not nessesary if with bi-direcitional RNNs.
 	- "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across.
 - **Batch Normalization Layers**:
    - Added to all above layers (except for data and loss layer).
    - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
- 
-
-Required Components                     | PaddlePaddle Support                      | Need to Develop
-:-------------------------------------  | :--------------------------------------   | :-----------------------
-Data Layer I (Spectrogram)	            | Not supported yet.                        |  TBD (Task 3)
-Data Layer II (Transcription)           | `paddle.data_type.integer_value_sequence` | -
-2D Convolution Layer                    | `paddle.layer.image_conv_layer`           | -
-DataType Converter (vec2seq)            | `paddle.layer.block_expand`               | -
-Bi-/Uni-directional RNNs                | `paddle.layer.recurrent_group`            | -
-Row Convolution Layer                   | Not supported yet.                        | TBD (Task 4)
-CTC-loss Layer                          | `paddle.layer.warp_ctc`                   | -
-Batch Normalization Layer               | `paddle.layer.batch_norm`                 | -
-CTC-Beam search                         | Not supported yet.                        | TBD (Task 6)
+   
+<table>
+<thead>
+<tr>
+<th>Required Components</th>
+<th> PaddlePaddle Support</th>
+<th> Need to Develop</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data Layer I (Spectrogram) </td>
+<td>Not supported yet.</td>
+<td>TBD (Task 3)</td>
+</tr>
+<tr>
+<td>Data Layer II (Transcription)  </td>
+<td> paddle.data_type.integer_value_sequence</td>
+<td> - </td>
+</tr>
+<tr>
+<td>2D Convolution Layer </td>
+<td> paddle.layer.image_conv_layer</td>
+<td> - </td>
+</tr>
+<tr>
+<td>DataType Converter (vec2seq)</td>
+<td> paddle.layer.block_expand</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Bi-/Uni-directional RNNs </td>
+<td>paddle.layer.recurrent_group</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Row Convolution Layer </td>
+<td>Not supported yet.</td>
+<td>TBD (Task 4)</td>
+</tr>
+<tr>
+<td>CTC-loss Layer </td>
+<td>paddle.layer.warp_ctc</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Batch Normalization Layer </td>
+<td>paddle.layer.batch_norm</td>
+<td> - </td>
+</tr>
+<tr>
+<td>CTC-Beam search </td>
+<td>Not supported yet.</td>
+<td> TBD (Task 6) </td>
+</tr>
+</tbody>
+</table>
+
 
 ### Row Convolution
 
@@ -145,14 +212,14 @@ TODO by Assignees
 Figure 2. Algorithm for CTC Beam Search Decoder.
 </div>
 
-- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts: 
-   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; 
+- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
+   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths;
    - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
 - An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
 - Such external scorer consists of language model, word count or any other custom scorers.
 - The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
-- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality. 
- 
+- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
+
 
 ## Future Work
 
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
index 9299658567..0c3f88d9c3 100644
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -26,13 +26,32 @@
 
 依据是否包含kernel，可以将Op分为两种：包含Kernel的Op和不包含kernel的Op，前者Op的定义继承自`OperatorWithKernel`，后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写，简单总结Op需要包含的内容如下：
 
-
- 内容            | 定义位置
---------------  | :----------------------
-OpProtoMake定义  | `.cc`文件，Backward Op不需要定义OpProtoMake
-Op定义           | `.cc`文件
-Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。
-注册Op           | Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中
+<table>
+<thead>
+<tr>
+<th>内容</th>
+<th>定义位置</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>OpProtoMake定义 </td>
+<td>`.cc`文件，Backward Op不需要定义OpProtoMake </td>
+</tr>
+<tr>
+<td>Op定义 </td>
+<td> `.cc`文件</td>
+</tr>
+<tr>
+<td>Kernel实现 </td>
+<td> CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。</td>
+</tr>
+<tr>
+<td>注册Op </td>
+<td> Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中</td>
+</tr>
+</tbody>
+</table>
 
 
 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
index da8b1bdd10..a566a09131 100644
--- a/doc/fluid/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
@@ -33,6 +33,33 @@ Op definition           | `.cc` files
 Kernel implementation       | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
 Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
 
+<table>
+<thead>
+<tr>
+<th>Information</th>
+<th> Where is it defined</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>OpProtoMake definition </td>
+<td> `.cc`files, Backward Op does not need an OpProtoMake interface. </td>
+</tr>
+<tr>
+<td>Op definition  </td>
+<td> `.cc` files</td>
+</tr>
+<tr>
+<td>Kernel implementation  </td>
+<td> The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.</td>
+</tr>
+<tr>
+<td>Registering the Op  </td>
+<td> Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.</td>
+</tr>
+</tbody>
+</table>
+
 
 New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
 
@@ -279,7 +306,7 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass
 
       def test_check_output(self):
           self.check_output()
-          
+
       def test_check_grad_normal(self):
           self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
 
diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process.md
index b978726109..addd474b69 100644
--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process.md
@@ -66,7 +66,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
 	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的功能分支。
 	* 当功能分支开发完毕后，向PaddlePaddle的主版本库提交`Pull Reuqest`，进而进行代码评审。
-		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。 
+		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。
 
 * BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
 
@@ -78,13 +78,137 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 
 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
 
+
 | | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
+
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+
 | API.V2 + Docker + GPU  |  |  |  |  |  |  |  |  |
+
 | API.V2 + Docker + CPU  |  |  |  |  |  |  |  |  |
+
 | `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
+
 | `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
+
 | API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+
 | API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+
 | `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+
 | `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>新手入门章节 </th>
+<th> 识别数字</th>
+<th> 图像分类</th>
+<th>词向量</th>
+<th> 情感分析</th>
+<th>语意角色标注</th>
+<th> 机器翻译</th>
+<th>个性化推荐</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td>API.V2 + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>API.V2 + Ubuntu + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + CPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+</tbody>
+</table>
diff --git a/doc/fluid/getstarted/concepts/save_model/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md
index e29129fddf..1f12ba0497 100644
--- a/doc/fluid/getstarted/concepts/save_model/model_format.md
+++ b/doc/fluid/getstarted/concepts/save_model/model_format.md
@@ -4,30 +4,70 @@
 
 A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
 
-As a result, In PaddlePaddle, the **topology** is represented as a  [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. 
+As a result, In PaddlePaddle, the **topology** is represented as a  [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters.
 
 ## Implementation
 
-The topology is saved as a plain text in a detailed self-contain protobuf file. 
+The topology is saved as a plain text in a detailed self-contain protobuf file.
 
 The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task.
 
-As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, 
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is,
 
 The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
 
-|field name  | type | description |
-| --- | --- | --- |
-| version | uint32_t | Version of saved file. Always 0 now. |
-| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
-| tensor desc | void* | TensorDesc protobuf binary message |
-| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
-| lod_level | uint64_t | Level of LoD |
-| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
-| data of lod[0] | uint64_t*  | [Optional] lod[0].data() |
-| ... | ... | ... |
-
+<table>
+<thead>
+<tr>
+<th>field name</th>
+<th>type </th>
+<th>description </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> version</td>
+<td> uint32_t </td>
+<td> Version of saved file. Always 0 now.</td>
+</tr>
 
+<tr>
+<td> tensor desc length  </td>
+<td> uint32_t </td>
+<td> TensorDesc(Protobuf message) length in bytes. </td>
+</tr>
+<tr>
+<td>tensor desc </td>
+<td> void*</td>
+<td> TensorDesc protobuf binary message </td>
+</tr>
+<tr>
+<td> tensor data </td>
+<td> void* </td>
+<td> Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` </td>
+</tr>
+<tr>
+<td> lod_level</td>
+<td> uint64_t </td>
+<td> Level of LoD </td>
+</tr>
+<tr>
+<td> length of lod[0] </td>
+<td> uint64_t </td>
+<td> [Optional] length of lod[0] in bytes. </td>
+</tr>
+<tr>
+<td> data of lod[0] </td>
+<td> uint64_t*   </td>
+<td> [Optional] lod[0].data() </td>
+</tr>
+<tr>
+<td>... </td>
+<td> ... </td>
+<td> ... </td>
+</tr>
+</tbody>
+</table>
 
 ## Summary
 
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
index 1b6f767869..b99b90056b 100644
--- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -65,10 +65,10 @@ exit(1)
 
 **因此，在分布式的Fluid环境中，我们有两个角色需要创建，分别是Parameter Server和Trainer。**
 
-### 分布式训练 
+### 分布式训练
 Fliud专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数，将他们分隔为两部分，通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
 ```python
-optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) 
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
 ```
 将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下：
 ```python
@@ -99,15 +99,51 @@ for pass_id in range(100):
 ### 分布式训练脚本运行说明
 分布式任务的运行需要将表格中说明的多个参数进行赋值:
 
-| 参数名 | 值类型 | 说明 | 示例 |
-|:-------------|:------|:---------------------------------------|:-------------|
-| trainer_id | int | 当前训练节点的ID，训练节点ID编号为0 - n-1， n为trainers的值 | 0/1/2/3 |
-| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
-| trainers | int | 训练节点的总个数，>0的数字 | 4 |
-| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
-| training_role | str | 节点角色， TRAINER/PSERVER | PSERVER |
-
-**注意：** ```training_role```是用来区分当前所起服务的角色的，用于训练程序中，用户可根据需要自行定义，其他参数为fluid.DistributeTranspiler的transpile函数所需要，需要在调用函数前进行定义，样例如下： 
+<table>
+<thead>
+<tr>
+<th>参数名</th>
+<th> 值类型</th>
+<th>说明</th>
+<th> 示例</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>trainer_id </td>
+<td> int</td>
+<td> 当前训练节点的ID，训练节点ID编号为0 - n-1， n为trainers的值 </td>
+<td> 0/1/2/3  </td>
+</tr>
+<tr>
+<td>pservers </td>
+<td> str</td>
+<td> parameter server 列表 </td>
+<td> 127.0.0.1:6710,127.0.0.1:6711 </td>
+</tr>
+<tr>
+<td>trainers </td>
+<td>int </td>
+<td> 训练节点的总个数，>0的数字 </td>
+<td> 4 </td>
+</tr>
+<tr>
+<td> server_endpoint</td>
+<td> str </td>
+<td> 当前所起的服务节点的IP:PORT </td>
+<td> 127.0.0.1:8789 </td>
+</tr>
+<tr>
+<td> training_role</td>
+<td>str </td>
+<td> 节点角色， TRAINER/PSERVER </td>
+<td> PSERVER </td>
+</tr>
+</tbody>
+</table>
+
+
+**注意：** ```training_role```是用来区分当前所起服务的角色的，用于训练程序中，用户可根据需要自行定义，其他参数为fluid.DistributeTranspiler的transpile函数所需要，需要在调用函数前进行定义，样例如下：
 
 ```python
 t = fluid.DistributeTranspiler()
diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md
index 17f895573a..8266dec3c6 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -42,14 +42,40 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
 
 每一列的含义是:
 
-| 列名 | 含义 |
-| --- | --- |
-| ncalls | 函数的调用次数 |
-| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
-| percall | tottime的每次调用平均时间 |
-| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
-| percall | cumtime的每次调用平均时间 |
-| filename:lineno(function) | 文件名, 行号，函数名 |
+<table>
+<thead>
+<tr>
+<th>列名</th>
+<th>含义 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> ncalls</td>
+<td> 函数的调用次数</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td> 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td> percall </td>
+<td> tottime的每次调用平均时间</td>
+</tr>
+<tr>
+<td> cumtime</td>
+<td> 函数总时间。包含这个函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td> percall</td>
+<td> cumtime的每次调用平均时间</td>
+</tr>
+<tr>
+<td> filename:lineno(function) </td>
+<td> 文件名, 行号，函数名 </td>
+</tr>
+</tbody>
+</table>
 
 
 ### 寻找性能瓶颈
diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
index abe4493c17..4447db252f 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -66,6 +66,41 @@ each column is as follows:
 | percall | cumtime divided by ncalls |
 | filename:lineno(function) | where the function is defined |
 
+<table>
+<thead>
+<tr>
+<th>column</th>
+<th>meaning </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> ncalls</td>
+<td> the number of calls into a function</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td> the total execution time of the function, not including the execution time of other functions called by the function</td>
+</tr>
+<tr>
+<td> percall </td>
+<td> tottime divided by ncalls</td>
+</tr>
+<tr>
+<td> cumtime</td>
+<td> the total execution time of the function, including the execution time of other functions being called</td>
+</tr>
+<tr>
+<td> percall</td>
+<td> cumtime divided by ncalls</td>
+</tr>
+<tr>
+<td> filename:lineno(function) </td>
+<td> where the function is define </td>
+</tr>
+</tbody>
+</table>
+
 ### Identify Performance Bottlenecks
 
 Usually, `tottime` and the related `percall` time is what we want to

From 6a7cba417432a750391c832ad18e805bbf03ab3e Mon Sep 17 00:00:00 2001
From: weixing <wx_crome@163.com>
Date: Tue, 3 Apr 2018 17:31:33 +0800
Subject: [PATCH 56/57] Update cpu_profiling_en.md

---
 doc/fluid/howto/optimization/cpu_profiling_en.md | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
index 4447db252f..e95556dd60 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -57,15 +57,6 @@ port, we will see the output like the following:
 where each line corresponds to Python function, and the meaning of
 each column is as follows:
 
-| column | meaning |
-| --- | --- |
-| ncalls | the number of calls into a function |
-| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
-| percall | tottime divided by ncalls |
-| cumtime | the total execution time of the function, including the execution time of other functions being called |
-| percall | cumtime divided by ncalls |
-| filename:lineno(function) | where the function is defined |
-
 <table>
 <thead>
 <tr>

From e81b140437395c6d137adb2a5664ea4390d3f3a0 Mon Sep 17 00:00:00 2001
From: weixing <wx_crome@163.com>
Date: Tue, 3 Apr 2018 17:32:10 +0800
Subject: [PATCH 57/57] Update releasing_process.md

---
 doc/fluid/dev/releasing_process.md | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process.md
index addd474b69..0810765b85 100644
--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process.md
@@ -78,27 +78,6 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 
 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
 
-
-| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
-
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-
-| API.V2 + Docker + GPU  |  |  |  |  |  |  |  |  |
-
-| API.V2 + Docker + CPU  |  |  |  |  |  |  |  |  |
-
-| `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
-
-| `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
-
-| API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
-
-| API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
-
-| `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
-
-| `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
-
 <table>
 <thead>
 <tr>

Trainer Counter	1	10	20	30	40	50	60	70	80	90	100
PaddlePaddle Fluid	-	-	-	-	-	-	-	-	-	-	-
PaddlePaddle v2	-	-	-	-	-	-	-	-	-	-	-
TensorFlow	-	-	-	-	-	-	-	-	-	-	-
Batch Size	32	64	128	256
PaddlePaddle Fluid	15.44	16.32	16.74	16.79
PaddlePaddle v2	15.97	17.04	17.60	17.83
TensorFlow	9.09	9.10	9.24	8.66
Batch Size	32	64	128	256
PaddlePaddle Fluid	190.20	222.15	247.40	258.18
PaddlePaddle v2	170.96	233.71	256.14	329.23
TensorFlow	-	-	-	-
Trainer Count	20	40	80	100
PaddlePaddle Fluid	263.29 (78.64%)	518.80 (77.47%)	836.26 (62.44%)	1019.29 (60.89%)
PaddlePaddle v2 (need more tests)	326.85 (92.85%)	534.58 (75.93%)	853.30 (60.60%)	1041.99 (59.20%)
TensorFlow	-	-	-	-
PServer Count	3	6	10	20
PaddlePaddle Fluid(should fix in next PR)	589.1	592.6	656.4	655.8
PaddlePaddle v2 (need more tests)	593.4	791.3	729.7	821.7
TensorFlow	-	-	-	-
C++	CUDA C++	Go
cc_library	nv_library	go_library
cc_binary	nv_binary	go_binary
cc_test	nv_test	go_test
programming languages	PaddlePaddle
for, while loop	RNN, WhileOp
if, if-else, switch	IfElseOp, SwitchOp
sequential execution	a sequence of layers
programming languages	PaddlePaddle
stack	scope hierarchy
stack frame	scope
push at entering block	push at entering block
pop at leaving block	destroy when minibatch completes
C++ functions/functors	mul	add
C++ operator class	mulOp	addOp	FCOp
Python binding	operator.mul	operator.add	operator.fc
Python function				layer.fc
	TensorFlow	PaddlePaddle
RNN	Support	Support
recursive RNN	Support	Support
padding zeros	Must	No need
blob data type	Tensor	LoDTensor