From 48556ba3bbf228cbe418c3a0634df9f7c147b211 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 11 Oct 2017 12:39:53 +0000 Subject: [PATCH 01/54] add block_expand_op --- paddle/operators/block_expand_op.cc | 80 ++++++++++++++++++++++++++ paddle/operators/block_expand_op.cu | 0 paddle/operators/block_expand_op.h | 89 +++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+) create mode 100644 paddle/operators/block_expand_op.cc create mode 100644 paddle/operators/block_expand_op.cu create mode 100644 paddle/operators/block_expand_op.h diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc new file mode 100644 index 0000000000..0b36dc1ae5 --- /dev/null +++ b/paddle/operators/block_expand_op.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/block_expand_op.h" + +namespace paddle { +namespace operators { + +class BlockExpandOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("block"), + "Input(block) of BlockExpandOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("padding"), + "Input(padding) of BlockExpandOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("stride"), + "Input(stride) of BlockExpandOp should not be null."); + // ctx->SetOutputDim("Out", {1}); + } +}; + +class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BlockExpandOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("block", "The input of block_expand op"); + AddOutput("stride", "The output of block_expand op"); + AddComment(R"DOC( +Expand feature map to minibatch matrix. +- matrix width is: blockH_ * blockW_ * channels_ +- matirx height is: outputH_ * outputW_ + +outputH\_ = 1 + (2paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) / + strideH\_ \\ +outputW\_ = 1 + (2paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) / + strideW\_ + +The expand method is the same with ExpandConvLayer, but saved the transposed +value. After expanding, output_.sequenceStartPositions will store timeline. +The number of time steps are outputH_outputW_ and the dimension of each +time step is blockH_ * blockW_ * channels_. This layer can be used after +convolution neural network, and before recurrent neural network. 
+)DOC"); + } +}; + +class BlockExpandGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(block_expand, ops::BlockExpandOp, ops::BlockExpandOpMaker, + block_expand_grad, ops::BlockExpandOpGrad); +REGISTER_OP_CPU_KERNEL( + block_expand, ops::BlockExpanddKernel); +REGISTER_OP_CPU_KERNEL( + block_expand_grad, + ops::BlockExpandGradKernel); diff --git a/paddle/operators/block_expand_op.cu b/paddle/operators/block_expand_op.cu new file mode 100644 index 0000000000..e69de29bb2 diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h new file mode 100644 index 0000000000..54a9c5354f --- /dev/null +++ b/paddle/operators/block_expand_op.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/operators/math/math_function.h" + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class BlockExpandKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using namespace framework; + const Tensor* input = context.Input("input"); + const Tensor* filter = context.Input("filter"); + const Tensor* stride = context.Input("stride"); + const Tensor* padding = context.Input("padding"); + Tensor* out = context.Output("Out"); + + auto input_dim = input->dims(); + size_t N = input_dim[0]; + size_t C = input_dim[1]; + PADDLE_ENFORCE_GE(N, 1, "Input batchsize must >= 1."); + PADDLE_ENFORCE_EQ(input_dim.size(), 4, "Input format must be NCHW."); + + size_t input_height = input_dim[2]; + size_t input_height = input_dim[3]; + + size_t filter_height = filter[0]; + size_t filter_width = filter[1]; + + size_t output_height = 1 + + (input_height + 2 * padding_height - block_height() + + stride_height - 1) / + stride_height; + + size_t output_width = + 1 + + (input_width + 2 * padding_width - block_width() + stride_width - 1) / + stride_width; + + Tensor col; + if (clo_format = KCFO) { + col.Resize( + {N, C, filter_height, filter_width, output_height, output_width}); + } else { + col.Resize( + {N, output_height, output_width, C, filter_height, filter_width}); + } + + for (size_t i = 0; i < N; i++) { + Im2ColFunctor(ctx, one_img, col, stride[0], + stride[1], padding[0], padding[1]); + } + } +}; + +template +class BlockExpandGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + /* + int x_num_col_dims = ctx.template Attr("x_num_col_dims"); + int y_num_col_dims = ctx.template Attr("y_num_col_dims"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + */ + } +}; + +} // 
namespace operators +} // namespace paddle From d2fda53217bf7c5370446f9a404b711ace9df130 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 12 Oct 2017 09:34:28 +0000 Subject: [PATCH 02/54] add expand comment --- paddle/operators/block_expand_op.cc | 40 +++++++++++++++++----- paddle/operators/block_expand_op.h | 52 ++++++++++++++--------------- 2 files changed, 58 insertions(+), 34 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index 0b36dc1ae5..69c5e02a65 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -23,12 +23,18 @@ class BlockExpandOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("block"), - "Input(block) of BlockExpandOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("padding"), - "Input(padding) of BlockExpandOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("stride"), - "Input(stride) of BlockExpandOp should not be null."); + using namespace framework; + PADDLE_ENFORCE(ctx->HasInput("input"), + "Input of BlockExpandOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of BlockExpandOp op should not be null."); + + auto in_dim = ctx->GetInputDim("input"); + PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); + PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); + + ctx->ShareLoD("X", /*->*/ "Out"); + // ctx->SetOutputDim("Out", {1}); } }; @@ -38,8 +44,26 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { BlockExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("block", "The input of block_expand op"); - AddOutput("stride", "The output of block_expand op"); + AddInput("input", "The input of block_expand op"); + AddOutput("out", "The output of block_expand op"); + AddAttr("block_height", + R"DOC( + )DOC"); + AddAttr("block_width", + R"DOC( + )DOC"); + AddAttr("stride_height", + R"DOC( + )DOC"); + AddAttr("stride_width", + R"DOC( + )DOC"); + AddAttr("padding_height", + R"DOC( + )DOC"); + AddAttr("padding_width", + R"DOC( + )DOC"); AddComment(R"DOC( Expand feature map to minibatch matrix. 
- matrix width is: blockH_ * blockW_ * channels_ diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 54a9c5354f..c0521dbbad 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -25,34 +25,34 @@ namespace operators { template class BlockExpandKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; - const Tensor* input = context.Input("input"); - const Tensor* filter = context.Input("filter"); - const Tensor* stride = context.Input("stride"); - const Tensor* padding = context.Input("padding"); - Tensor* out = context.Output("Out"); - - auto input_dim = input->dims(); - size_t N = input_dim[0]; - size_t C = input_dim[1]; - PADDLE_ENFORCE_GE(N, 1, "Input batchsize must >= 1."); - PADDLE_ENFORCE_EQ(input_dim.size(), 4, "Input format must be NCHW."); - - size_t input_height = input_dim[2]; - size_t input_height = input_dim[3]; - - size_t filter_height = filter[0]; - size_t filter_width = filter[1]; - - size_t output_height = 1 + - (input_height + 2 * padding_height - block_height() + - stride_height - 1) / - stride_height; - - size_t output_width = + const Tensor* in = ctx.Input("input"); + Tensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto in_dim = in->dims(); + int N = in_dim[0]; + int C = in_dim[1]; + + int in_height = in_dim[2]; + int in_width = in_dim[3]; + + int block_height = ctx.Attr("block_height"); + int block_width = ctx.Attr("block_width"); + int stride_height = ctx.Attr("stride_height"); + int stride_width = ctx.Attr("stride_width"); + int padding_height = ctx.Attr("padding_height"); + int padding_width = ctx.Attr("padding_width"); + + int output_height = + 1 + + (in_height + 2 * padding_height - block_height + stride_height - 1) / + stride_height; + + int output_width = 1 + - (input_width + 2 * padding_width - block_width() + stride_width - 1) / + (in_width + 2 * padding_width - block_width + stride_width - 1) / stride_width; Tensor col; From f1ca3f7e5ed13fc23acb2ce79f756e939e604031 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 16 Oct 2017 11:43:21 +0000 Subject: [PATCH 03/54] add block forward --- paddle/operators/block_expand_op.cc | 94 +++++++++++++++++------------ paddle/operators/block_expand_op.h | 81 +++++++++++++------------ 2 files changed, 98 insertions(+), 77 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index 69c5e02a65..ec46737400 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -24,18 +24,43 @@ class BlockExpandOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { using namespace framework; - PADDLE_ENFORCE(ctx->HasInput("input"), + PADDLE_ENFORCE(ctx->HasInput("X"), "Input of BlockExpandOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of BlockExpandOp op should not be null."); + "Output of BlockExpandOp op should not be null."); - auto in_dim = ctx->GetInputDim("input"); + auto in_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); - ctx->ShareLoD("X", /*->*/ "Out"); - - // ctx->SetOutputDim("Out", {1}); + int blockHeight = ctx->Attrs().Get("blockHeight"); + int blockWidth = 
ctx->Attrs().Get("blockWidth"); + int strideHeight = ctx->Attrs().Get("strideHeight"); + int strideWidth = ctx->Attrs().Get("strideWidth"); + int paddingHeight = ctx->Attrs().Get("paddingHeight"); + int paddingWidth = ctx->Attrs().Get("paddingWidth"); + + int N = in_dim[0]; + int C = in_dim[1]; + int imgHeight = in_dim[3]; + int imgWidth = in_dim[4]; + + int outputHeight = 0; + int outputWidth = 0; + + get_blockexpand_output_shape(imgHeight, imgWidth, blockHeight, blockWidth, + strideHeight, strideWidth, paddingHeight, + paddingWidth, outputHeight, outputWidth); + + // The result of im2col is [outputHeight, outputWidth, + // inputChannels, filterHeight, filterWidth], and it is easy to + // reshape into [seqLength, stepSize], where seqLength is equal + // outputHeight * outputWidth, stepSize is equal + // input_channels * blockHeight * blockWidth + ctx->SetOutputDim( + "Out", {N, outputHeight, outputWidth, C, blockHeight, blockWidth}); + + // ctx->ShareLoD("X", /*->*/ "Out"); } }; @@ -44,41 +69,36 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { BlockExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "The input of block_expand op"); - AddOutput("out", "The output of block_expand op"); - AddAttr("block_height", - R"DOC( - )DOC"); - AddAttr("block_width", - R"DOC( - )DOC"); - AddAttr("stride_height", - R"DOC( - )DOC"); - AddAttr("stride_width", - R"DOC( - )DOC"); - AddAttr("padding_height", - R"DOC( - )DOC"); - AddAttr("padding_width", - R"DOC( - )DOC"); + AddInput("X", R"DOC( +(Tensor)The input tensor has NCHW format. + N: batch size + C: channels + H: height + W: width +)DOC"); + AddOutput("Out", "(LodTensor)The output data of block_expand op,"); + AddAttr("blockHeight", "(int)height of block."); + AddAttr("blockWidth", "(int)width of block."); + AddAttr("strideHeight", "(int)height of stride."); + AddAttr("strideWidth", "(int)width of stride."); + AddAttr("paddingHeight", "(int)height of padding."); + AddAttr("paddingWidth", "(int)width of padding."); AddComment(R"DOC( Expand feature map to minibatch matrix. -- matrix width is: blockH_ * blockW_ * channels_ -- matirx height is: outputH_ * outputW_ +- matirx height is: outputHeight * outputWidth +- matrix width is: blockHeight * blockWidth * channels -outputH\_ = 1 + (2paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) / - strideH\_ \\ -outputW\_ = 1 + (2paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) / - strideW\_ +outputHeight = + 1 + (2 * paddingHeight + imgHeight - blockHeight + strideHeight - 1) / + strideHeight; +outputWidth = + 1 + (2 * paddingWidth + imgWidth - blockWidth + strideWidth - 1) / + strideWidth; The expand method is the same with ExpandConvLayer, but saved the transposed -value. After expanding, output_.sequenceStartPositions will store timeline. -The number of time steps are outputH_outputW_ and the dimension of each -time step is blockH_ * blockW_ * channels_. This layer can be used after -convolution neural network, and before recurrent neural network. +value. After expanding, The number of time steps are outputHeight * outputWidth +and the dimension of each time step is blockHeight * blockWidth * channels. +This layer can be used after convolution neural network, and before recurrent neural network. 
)DOC"); } }; @@ -98,7 +118,7 @@ namespace ops = paddle::operators; REGISTER_OP(block_expand, ops::BlockExpandOp, ops::BlockExpandOpMaker, block_expand_grad, ops::BlockExpandOpGrad); REGISTER_OP_CPU_KERNEL( - block_expand, ops::BlockExpanddKernel); + block_expand, ops::BlockExpandKernel); REGISTER_OP_CPU_KERNEL( block_expand_grad, ops::BlockExpandGradKernel); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index c0521dbbad..58f9e4c6ad 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -18,10 +18,26 @@ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/img2col.h" namespace paddle { namespace operators { +inline void get_blockexpand_output_shape(int imgHeight, int imgWidth, + int blockHeight, int blockWidth, + int strideHeight, int strideWidth, + int paddingHeight, int paddingWidth, + int& outputHeight, int& outputWidth) { + outputHeight = + 1 + + (imgHeight + 2 * paddingHeight - blockHeight + strideHeight - 1) / + strideHeight; + + outputWidth = 1 + + (imgWidth + 2 * paddingWidth - blockWidth + strideWidth - 1) / + strideWidth; +} + template class BlockExpandKernel : public framework::OpKernel { public: @@ -34,39 +50,30 @@ class BlockExpandKernel : public framework::OpKernel { auto in_dim = in->dims(); int N = in_dim[0]; int C = in_dim[1]; - - int in_height = in_dim[2]; - int in_width = in_dim[3]; - - int block_height = ctx.Attr("block_height"); - int block_width = ctx.Attr("block_width"); - int stride_height = ctx.Attr("stride_height"); - int stride_width = ctx.Attr("stride_width"); - int padding_height = ctx.Attr("padding_height"); - int padding_width = ctx.Attr("padding_width"); - - int output_height = - 1 + - (in_height + 2 * padding_height - block_height + stride_height - 1) / - stride_height; - - int output_width = - 1 + - (in_width + 2 * padding_width - block_width + stride_width - 1) / - stride_width; - - Tensor col; - if (clo_format = KCFO) { - col.Resize( - {N, C, filter_height, filter_width, output_height, output_width}); - } else { - col.Resize( - {N, output_height, output_width, C, filter_height, filter_width}); - } - - for (size_t i = 0; i < N; i++) { - Im2ColFunctor(ctx, one_img, col, stride[0], - stride[1], padding[0], padding[1]); + int imgHeight = in_dim[2]; + int imgWidth = in_dim[3]; + + int blockHeight = ctx.Attr("blockHeight"); + int blockWidth = ctx.Attr("blockWidth"); + int strideHeight = ctx.Attr("strideHeight"); + int strideWidth = ctx.Attr("strideWidth"); + int paddingHeight = ctx.Attr("paddingHeight"); + int paddingWidth = ctx.Attr("paddingWidth"); + + int outputHeight = 0; + int outputWidth = 0; + + get_blockexpand_output_shape(imgHeight, imgWidth, blockHeight, blockWidth, + strideHeight, strideWidth, paddingHeight, + paddingWidth, outputHeight, outputWidth); + + for (int i = 0; i < N; i++) { + Tensor src = in->Slice(i, i + 1).Resize(C, imgHeight, imgWidth); + Tensor dst = out->Slice(i, i + 1).Resize(outputHeight, outputWidth, C, + blockHeight, blockWidth); + math::Im2ColFunctorGetPlace(), T>(ctx, src, dst, strideHeight, + strideWidth, paddingHeight, + paddingWidth); } } }; @@ -75,13 +82,7 @@ template class BlockExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - /* - int x_num_col_dims = ctx.template Attr("x_num_col_dims"); - int y_num_col_dims = ctx.template Attr("y_num_col_dims"); - const Tensor* x = 
ctx.Input("X"); - const Tensor* y = ctx.Input("Y"); - */ + using namespace framework; } }; From 6197c09bf92324696b237bf0320ce43d28097c70 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 16 Oct 2017 12:08:30 +0000 Subject: [PATCH 04/54] modify styles --- paddle/operators/block_expand_op.cc | 45 ++++++++++++------------ paddle/operators/block_expand_op.h | 53 +++++++++++++++-------------- 2 files changed, 50 insertions(+), 48 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index ec46737400..b3fad3c81f 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -33,32 +33,33 @@ class BlockExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); - int blockHeight = ctx->Attrs().Get("blockHeight"); - int blockWidth = ctx->Attrs().Get("blockWidth"); - int strideHeight = ctx->Attrs().Get("strideHeight"); - int strideWidth = ctx->Attrs().Get("strideWidth"); - int paddingHeight = ctx->Attrs().Get("paddingHeight"); - int paddingWidth = ctx->Attrs().Get("paddingWidth"); + int block_height = ctx->Attrs().Get("blockHeight"); + int block_width = ctx->Attrs().Get("blockWidth"); + int stride_height = ctx->Attrs().Get("strideHeight"); + int stride_width = ctx->Attrs().Get("strideWidth"); + int padding_height = ctx->Attrs().Get("paddingHeight"); + int padding_width = ctx->Attrs().Get("paddingWidth"); int N = in_dim[0]; int C = in_dim[1]; - int imgHeight = in_dim[3]; - int imgWidth = in_dim[4]; + int img_height = in_dim[3]; + int img_width = in_dim[4]; - int outputHeight = 0; - int outputWidth = 0; + int output_height = 0; + int output_width = 0; - get_blockexpand_output_shape(imgHeight, imgWidth, blockHeight, blockWidth, - strideHeight, strideWidth, paddingHeight, - paddingWidth, outputHeight, outputWidth); + get_blockexpand_output_shape(img_height, img_width, block_height, + block_width, stride_height, stride_width, + padding_height, padding_width, output_height, + output_width); - // The result of im2col is [outputHeight, outputWidth, + // The result of im2col is [output_height, output_width, // inputChannels, filterHeight, filterWidth], and it is easy to // reshape into [seqLength, stepSize], where seqLength is equal - // outputHeight * outputWidth, stepSize is equal + // output_height * output_width, stepSize is equal // input_channels * blockHeight * blockWidth ctx->SetOutputDim( - "Out", {N, outputHeight, outputWidth, C, blockHeight, blockWidth}); + "Out", {N, output_height, output_width, C, block_height, block_width}); // ctx->ShareLoD("X", /*->*/ "Out"); } @@ -85,18 +86,18 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("paddingWidth", "(int)width of padding."); AddComment(R"DOC( Expand feature map to minibatch matrix. -- matirx height is: outputHeight * outputWidth +- matirx height is: output_height * output_width - matrix width is: blockHeight * blockWidth * channels -outputHeight = - 1 + (2 * paddingHeight + imgHeight - blockHeight + strideHeight - 1) / +output_height = + 1 + (2 * paddingHeight + img_height - blockHeight + strideHeight - 1) / strideHeight; -outputWidth = - 1 + (2 * paddingWidth + imgWidth - blockWidth + strideWidth - 1) / +output_width = + 1 + (2 * paddingWidth + img_width - blockWidth + strideWidth - 1) / strideWidth; The expand method is the same with ExpandConvLayer, but saved the transposed -value. 
After expanding, The number of time steps are outputHeight * outputWidth +value. After expanding, The number of time steps are output_height * output_width and the dimension of each time step is blockHeight * blockWidth * channels. This layer can be used after convolution neural network, and before recurrent neural network. )DOC"); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 58f9e4c6ad..bd6b307852 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -18,24 +18,25 @@ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/math/img2col.h" +#include "paddle/operators/math/im2col.h" namespace paddle { namespace operators { -inline void get_blockexpand_output_shape(int imgHeight, int imgWidth, - int blockHeight, int blockWidth, - int strideHeight, int strideWidth, - int paddingHeight, int paddingWidth, +inline void get_blockexpand_output_shape(int img_height, int img_width, + int block_height, int block_width, + int stride_height, int stride_width, + int padding_height, int padding_width, int& outputHeight, int& outputWidth) { outputHeight = 1 + - (imgHeight + 2 * paddingHeight - blockHeight + strideHeight - 1) / - strideHeight; + (img_height + 2 * padding_height - block_height + stride_height - 1) / + stride_height; - outputWidth = 1 + - (imgWidth + 2 * paddingWidth - blockWidth + strideWidth - 1) / - strideWidth; + outputWidth = + 1 + + (img_width + 2 * padding_width - block_width + stride_width - 1) / + stride_width; } template @@ -50,30 +51,30 @@ class BlockExpandKernel : public framework::OpKernel { auto in_dim = in->dims(); int N = in_dim[0]; int C = in_dim[1]; - int imgHeight = in_dim[2]; - int imgWidth = in_dim[3]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; - int blockHeight = ctx.Attr("blockHeight"); - int blockWidth = ctx.Attr("blockWidth"); - int strideHeight = ctx.Attr("strideHeight"); - int strideWidth = ctx.Attr("strideWidth"); - int paddingHeight = ctx.Attr("paddingHeight"); - int paddingWidth = ctx.Attr("paddingWidth"); + int block_height = ctx.Attr("blockHeight"); + int block_width = ctx.Attr("blockWidth"); + int stride_height = ctx.Attr("strideHeight"); + int stride_width = ctx.Attr("strideWidth"); + int padding_height = ctx.Attr("paddingHeight"); + int padding_width = ctx.Attr("paddingWidth"); int outputHeight = 0; int outputWidth = 0; - get_blockexpand_output_shape(imgHeight, imgWidth, blockHeight, blockWidth, - strideHeight, strideWidth, paddingHeight, - paddingWidth, outputHeight, outputWidth); + get_blockexpand_output_shape( + img_height, img_width, block_height, block_width, stride_height, + stride_width, padding_height, padding_width, outputHeight, outputWidth); for (int i = 0; i < N; i++) { - Tensor src = in->Slice(i, i + 1).Resize(C, imgHeight, imgWidth); + Tensor src = in->Slice(i, i + 1).Resize(C, img_height, img_width); Tensor dst = out->Slice(i, i + 1).Resize(outputHeight, outputWidth, C, - blockHeight, blockWidth); - math::Im2ColFunctorGetPlace(), T>(ctx, src, dst, strideHeight, - strideWidth, paddingHeight, - paddingWidth); + block_height, block_width); + math::Im2ColFunctor( + ctx, src, dst, stride_height, stride_width, padding_height, + padding_width); } } }; From 5a9dd8ae5a5154a4e2a96becc057621a6221ca55 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 17 Oct 2017 03:48:34 +0000 Subject: [PATCH 05/54] add gpu --- paddle/operators/block_expand_op.cc | 15 ++++++++-- paddle/operators/block_expand_op.cu | 24 
+++++++++++++++ paddle/operators/block_expand_op.h | 46 +++++++++++++++++++++++++---- 3 files changed, 77 insertions(+), 8 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index b3fad3c81f..49c7011fe1 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -109,7 +109,18 @@ class BlockExpandGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext* ctx) const override {} + void InferShape(framework::InferShapeContext* ctx) const override { + using namespace framework; + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output of BlockExpandOp op should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto in_dim = ctx->GetInputDim("X"); + + ctx->SetOutputDim(GradVarName("Out"), in_dim); + } }; } // namespace operators @@ -117,7 +128,7 @@ class BlockExpandGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(block_expand, ops::BlockExpandOp, ops::BlockExpandOpMaker, - block_expand_grad, ops::BlockExpandOpGrad); + block_expand_grad, ops::BlockExpandGradOp); REGISTER_OP_CPU_KERNEL( block_expand, ops::BlockExpandKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/operators/block_expand_op.cu b/paddle/operators/block_expand_op.cu index e69de29bb2..492ac0c9b2 100644 --- a/paddle/operators/block_expand_op.cu +++ b/paddle/operators/block_expand_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/operators/block_expand_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_GPU_KERNEL( + block_expand, ops::BlockExpandKernel); +REGISTER_OP_GPU_KERNEL( + block_expand_grad, + ops::BlockExpandGradKernel); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index bd6b307852..b272582883 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -69,12 +69,12 @@ class BlockExpandKernel : public framework::OpKernel { stride_width, padding_height, padding_width, outputHeight, outputWidth); for (int i = 0; i < N; i++) { - Tensor src = in->Slice(i, i + 1).Resize(C, img_height, img_width); - Tensor dst = out->Slice(i, i + 1).Resize(outputHeight, outputWidth, C, - block_height, block_width); - math::Im2ColFunctor( - ctx, src, dst, stride_height, stride_width, padding_height, - padding_width); + Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); + Tensor dst = out->Slice(i, i + 1).Resize( + {outputHeight, outputWidth, C, block_height, block_width}); + math::Im2ColFunctor f; + f(ctx.device_context(), src, dst, stride_height, stride_width, + padding_height, padding_width); } } }; @@ -84,6 +84,40 @@ class BlockExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; + auto* in = ctx.Input("X"); + auto* out = ctx.Input("Out"); + auto* out_grad = ctx.Output(GradVarName("Out")); + out_grad->mutable_data(ctx.GetPlace()); + + auto in_dim = in->dims(); + int N = in_dim[0]; + int C = in_dim[1]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; + + int block_height = ctx.Attr("blockHeight"); + int block_width = ctx.Attr("blockWidth"); + int stride_height = ctx.Attr("strideHeight"); + int stride_width = ctx.Attr("strideWidth"); + int padding_height = ctx.Attr("paddingHeight"); + int padding_width = ctx.Attr("paddingWidth"); + + int outputHeight = 0; + int outputWidth = 0; + + get_blockexpand_output_shape( + img_height, img_width, block_height, block_width, stride_height, + stride_width, padding_height, padding_width, outputHeight, outputWidth); + + for (int i = 0; i < N; i++) { + Tensor dst = + out_grad->Slice(i, i + 1).Resize({C, img_height, img_width}); + Tensor src = out->Slice(i, i + 1).Resize( + {outputHeight, outputWidth, C, block_height, block_width}); + math::Im2ColFunctor f; + f(ctx.device_context(), src, dst, stride_height, stride_width, + padding_height, padding_width); + } } }; From 45f16c90456775d80ec3fbff5c87d17c06558c5b Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 17 Oct 2017 07:15:19 +0000 Subject: [PATCH 06/54] add py test --- .../framework/tests/test_block_expand_op.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 python/paddle/v2/framework/tests/test_block_expand_op.py diff --git a/python/paddle/v2/framework/tests/test_block_expand_op.py b/python/paddle/v2/framework/tests/test_block_expand_op.py new file mode 100644 index 0000000000..aa4fa479a9 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_block_expand_op.py @@ -0,0 +1,121 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def get_output_shape(attrs, X): + img_height = X.shape[2] + img_width = X.shpe[3] + padding_height = attrs['padding_height'] + padding_width = attrs['padding_width'] + block_height = attrs['block_height'] + block_width = attrs['block_width'] + stride_height = attrs['stride_height'] + stride_width = 
attrs['stride_width'] + output_height = \ + 1 + \ + (img_height + 2 * padding_height - block_height + stride_height - 1) / \ + stride_height + + output_width = \ + 1 + \ + (img_width + 2 * padding_width - block_width + stride_width - 1) / \ + stride_width + + return output_height, output_width + + +""" +img: {CHW} +col: + {output_height, output_width, inputChannels, filterHeight, filterWidth} +""" + + +def img2col(attrs, im, col): + input_channels = im.shape.dims[0] + input_height = im.shape.dims[1] + input_width = im.shape.dims[2] + filter_height = col.shape.dims[3] + filter_width = col.shape.dims[4] + output_height = col.shape.dims[0] + output_width = col.shape.dims[1] + + for col_row_idx in range(0, output_height): + for col_col_idx in range(0, output_width): + for channel in range(0, input_channels): + for filter_row_idx in range(0, filter_height): + for filter_col_idx in range(0, filter_width): + im_row_offset = col_row_idx * stride_height \ + + filter_row_idx - padding_height + im_col_offset = col_col_idx * stride_width \ + + filter_col_idx - padding_width + if (im_row_offset < 0 or + im_row_offset >= input_height or + im_col_offset < 0 or + im_col_offset >= input_width): + col[col_row_idx][col_col_idx][channel][ + filter_row_idx][filter_col_idx] = 0.0 + else: + im_offset = (channel * input_height + im_row_offset + ) * input_width + im_col_offset + col[col_row_idx][col_col_idx][channel][ + filter_row_idx][filter_col_idx] = im[channel][ + im_row_offset][im_col_offset] + + +""" +img: {CHW} +col: + {output_height, output_width, inputChannels, filterHeight, filterWidth} +""" + + +def col2img(attrs, col, img): + input_channels = im.shape.dims[0] + input_height = im.shape.dims[1] + input_width = im.shape.dims[2] + filter_height = col.shape.dims[3] + filter_width = col.shape.dims[4] + output_height = col.shape.dims[0] + output_width = col.shape.dims[1] + + for col_row_idx in range(0, output_height): + for col_col_idx in range(0, output_width): + for channel in range(0, input_channels): + for filter_row_idx in range(0, filter_height): + for filter_col_idx in range(0, filter_width): + im_row_offset = \ + col_row_idx * stride_height + filter_row_idx - padding_height + im_col_offset = \ + col_col_idx * stride_width + filter_col_idx - padding_width + if (im_row_offset >= 0 and + im_row_offset < input_height and + im_col_offset >= 0 and + im_col_offset < input_width): + im[channel][im_row_offset][im_col_offset] = \ + col[col_row_idx][col_col_idx][channel][filter_row_idx][filter_col_idx] + + +class TestBlockExpandMulOp(OpTest): + def setUp(self): + self.op_type = "block_expand" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 9, 9]).astype("float64"), + } + self.attrs = { + 'block_height': 3, + 'block_width': 3, + 'stride_height': 2, + 'stride_width': 2, + 'padding_height': 3, + 'padding_width': 3, + } + + self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out') From 32db8db51c5384f213a0b1402d2632519da5416a Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 17 Oct 2017 08:12:56 +0000 Subject: [PATCH 07/54] fix bugs --- paddle/operators/block_expand_op.cc | 9 +- paddle/operators/block_expand_op.h | 9 +- .../framework/tests/test_block_expand_op.py | 176 +++++++++++------- 3 files changed, 120 insertions(+), 74 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index 49c7011fe1..37ea57f393 100644 --- 
a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -23,6 +23,7 @@ class BlockExpandOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { + printf("op infershape\n"); using namespace framework; PADDLE_ENFORCE(ctx->HasInput("X"), "Input of BlockExpandOp should not be null."); @@ -33,6 +34,7 @@ class BlockExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); + printf("op infershape2\n"); int block_height = ctx->Attrs().Get("blockHeight"); int block_width = ctx->Attrs().Get("blockWidth"); int stride_height = ctx->Attrs().Get("strideHeight"); @@ -42,8 +44,8 @@ class BlockExpandOp : public framework::OperatorWithKernel { int N = in_dim[0]; int C = in_dim[1]; - int img_height = in_dim[3]; - int img_width = in_dim[4]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; int output_height = 0; int output_width = 0; @@ -58,6 +60,8 @@ class BlockExpandOp : public framework::OperatorWithKernel { // reshape into [seqLength, stepSize], where seqLength is equal // output_height * output_width, stepSize is equal // input_channels * blockHeight * blockWidth + printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, output_height, + output_width, C, block_height, block_width); ctx->SetOutputDim( "Out", {N, output_height, output_width, C, block_height, block_width}); @@ -77,6 +81,7 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { H: height W: width )DOC"); + printf("opmakeer\n"); AddOutput("Out", "(LodTensor)The output data of block_expand op,"); AddAttr("blockHeight", "(int)height of block."); AddAttr("blockWidth", "(int)width of block."); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index b272582883..69bd7d6987 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -44,7 +44,7 @@ class BlockExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; - const Tensor* in = ctx.Input("input"); + const Tensor* in = ctx.Input("X"); Tensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -68,7 +68,11 @@ class BlockExpandKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); + printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, outputHeight, + outputWidth, C, block_height, block_width); + for (int i = 0; i < N; i++) { + printf("i:%d\n", i); Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); Tensor dst = out->Slice(i, i + 1).Resize( {outputHeight, outputWidth, C, block_height, block_width}); @@ -109,6 +113,9 @@ class BlockExpandGradKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); + printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, outputHeight, + outputWidth, C, block_height, block_width); + for (int i = 0; i < N; i++) { Tensor dst = out_grad->Slice(i, i + 1).Resize({C, img_height, img_width}); diff --git a/python/paddle/v2/framework/tests/test_block_expand_op.py b/python/paddle/v2/framework/tests/test_block_expand_op.py index aa4fa479a9..f8f4afc880 100644 --- a/python/paddle/v2/framework/tests/test_block_expand_op.py +++ 
b/python/paddle/v2/framework/tests/test_block_expand_op.py @@ -3,119 +3,153 @@ import numpy as np from op_test import OpTest -def get_output_shape(attrs, X): - img_height = X.shape[2] - img_width = X.shpe[3] - padding_height = attrs['padding_height'] - padding_width = attrs['padding_width'] - block_height = attrs['block_height'] - block_width = attrs['block_width'] - stride_height = attrs['stride_height'] - stride_width = attrs['stride_width'] - output_height = \ +def get_output_shape(attrs, x): + imgHeight = x.shape[1] + imgWidth = x.shape[2] + + paddingHeight = attrs['paddingHeight'] + paddingWidth = attrs['paddingWidth'] + blockHeight = attrs['blockHeight'] + blockWidth = attrs['blockWidth'] + strideHeight = attrs['strideHeight'] + strideWidth = attrs['strideWidth'] + + outputHeight = \ 1 + \ - (img_height + 2 * padding_height - block_height + stride_height - 1) / \ - stride_height + (imgHeight + 2 * paddingHeight - blockHeight + strideHeight - 1) / \ + strideHeight - output_width = \ + outputWidth = \ 1 + \ - (img_width + 2 * padding_width - block_width + stride_width - 1) / \ - stride_width + (imgWidth + 2 * paddingWidth - blockWidth + strideWidth - 1) / \ + strideWidth - return output_height, output_width + return outputHeight, outputWidth """ -img: {CHW} +im: {CHW} col: - {output_height, output_width, inputChannels, filterHeight, filterWidth} + {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} """ -def img2col(attrs, im, col): - input_channels = im.shape.dims[0] - input_height = im.shape.dims[1] - input_width = im.shape.dims[2] - filter_height = col.shape.dims[3] - filter_width = col.shape.dims[4] - output_height = col.shape.dims[0] - output_width = col.shape.dims[1] +def im2col(attrs, im, col): + input_channels = im.shape[0] + inputHeight = im.shape[1] + inputWidth = im.shape[2] + + outputHeight = col.shape[0] + outputWidth = col.shape[1] + filterHeight = col.shape[3] + filterWidth = col.shape[4] - for col_row_idx in range(0, output_height): - for col_col_idx in range(0, output_width): + strideHeight = attrs['strideHeight'] + strideWidth = attrs['strideWidth'] + paddingHeight = attrs['paddingHeight'] + paddingWidth = attrs['paddingWidth'] + + for col_row_idx in range(0, outputHeight): + for col_col_idx in range(0, outputWidth): for channel in range(0, input_channels): - for filter_row_idx in range(0, filter_height): - for filter_col_idx in range(0, filter_width): - im_row_offset = col_row_idx * stride_height \ - + filter_row_idx - padding_height - im_col_offset = col_col_idx * stride_width \ - + filter_col_idx - padding_width - if (im_row_offset < 0 or - im_row_offset >= input_height or + for filter_row_idx in range(0, filterHeight): + for filter_col_idx in range(0, filterWidth): + im_row_offset = col_row_idx * strideHeight \ + + filter_row_idx - paddingHeight + + im_col_offset = col_col_idx * strideWidth \ + + filter_col_idx - paddingWidth + + if (im_row_offset < 0 or im_row_offset >= inputHeight or im_col_offset < 0 or - im_col_offset >= input_width): - col[col_row_idx][col_col_idx][channel][ + im_col_offset >= inputWidth): + col[col_row_idx][col_col_idx][channel][\ filter_row_idx][filter_col_idx] = 0.0 else: - im_offset = (channel * input_height + im_row_offset - ) * input_width + im_col_offset - col[col_row_idx][col_col_idx][channel][ - filter_row_idx][filter_col_idx] = im[channel][ + im_offset = (channel * inputHeight + im_row_offset \ + ) * inputWidth + im_col_offset + + col[col_row_idx][col_col_idx][channel][\ + filter_row_idx][filter_col_idx] = 
im[channel][ \ im_row_offset][im_col_offset] """ img: {CHW} col: - {output_height, output_width, inputChannels, filterHeight, filterWidth} + {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} """ def col2img(attrs, col, img): - input_channels = im.shape.dims[0] - input_height = im.shape.dims[1] - input_width = im.shape.dims[2] - filter_height = col.shape.dims[3] - filter_width = col.shape.dims[4] - output_height = col.shape.dims[0] - output_width = col.shape.dims[1] - - for col_row_idx in range(0, output_height): - for col_col_idx in range(0, output_width): + input_channels = im.shape[0] + inputHeight = im.shape[1] + inputWidth = im.shape[2] + + outputHeight = col.shape[0] + outputWidth = col.shape[1] + filterHeight = col.shape[3] + filterWidth = col.shape[4] + + strideHeight = attrs['strideHeight'] + strideWidth = attrs['strideWidth'] + paddingHeight = attrs['paddingHeight'] + paddingWidth = attrs['paddingWidth'] + + for col_row_idx in range(0, outputHeight): + for col_col_idx in range(0, outputWidth): for channel in range(0, input_channels): - for filter_row_idx in range(0, filter_height): - for filter_col_idx in range(0, filter_width): + for filter_row_idx in range(0, filterHeight): + for filter_col_idx in range(0, filterWidth): im_row_offset = \ - col_row_idx * stride_height + filter_row_idx - padding_height + col_row_idx * strideHeight + filter_row_idx - paddingHeight im_col_offset = \ - col_col_idx * stride_width + filter_col_idx - padding_width + col_col_idx * strideWidth + filter_col_idx - paddingWidth if (im_row_offset >= 0 and - im_row_offset < input_height and + im_row_offset < inputHeight and im_col_offset >= 0 and - im_col_offset < input_width): + im_col_offset < inputWidth): im[channel][im_row_offset][im_col_offset] = \ col[col_row_idx][col_col_idx][channel][filter_row_idx][filter_col_idx] class TestBlockExpandMulOp(OpTest): def setUp(self): - self.op_type = "block_expand" - self.inputs = { - 'X': np.random.uniform(0.1, 1, [2, 3, 9, 9]).astype("float64"), - } - self.attrs = { - 'block_height': 3, - 'block_width': 3, - 'stride_height': 2, - 'stride_width': 2, - 'padding_height': 3, - 'padding_width': 3, + x = np.random.uniform(0.1, 1, [3, 9, 9]).astype("float32") + attrs = { + 'blockHeight': 3, + 'blockWidth': 3, + 'strideHeight': 2, + 'strideWidth': 2, + 'paddingHeight': 3, + 'paddingWidth': 3, } - self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} + outputHeight, outputWidth = get_output_shape(attrs, x) + out = np.random.uniform(0.1, 1,\ + [outputHeight, outputWidth, x.shape[0], \ + attrs['blockHeight'], attrs['blockWidth']]).astype("float32") + + self.op_type = "block_expand" + self.inputs = {'X': x.reshape(1, 3, 9, 9)} + self.attrs = attrs + + im2col(attrs, x, out) + self.outputs = { + 'Out':out.reshape(1, outputHeight, outputWidth, x.shape[0], \ + attrs['blockHeight'], attrs['blockWidth']) + } + #print out def test_check_output(self): self.check_output() + print 1 + """ def test_check_grad_normal(self): self.check_grad(['X'], 'Out') + """ + + +if __name__ == '__main__': + unittest.main() From d3ac3393fc803d210a1bab4f89249657b2e8786c Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 17 Oct 2017 12:43:32 +0000 Subject: [PATCH 08/54] fix bugs --- paddle/operators/block_expand_op.cc | 14 +------ paddle/operators/block_expand_op.h | 25 +++++------- .../framework/tests/test_block_expand_op.py | 40 ++++++++++++------- 3 files changed, 38 insertions(+), 41 deletions(-) diff --git a/paddle/operators/block_expand_op.cc 
b/paddle/operators/block_expand_op.cc index 37ea57f393..d72c6b2de1 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -23,7 +23,6 @@ class BlockExpandOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - printf("op infershape\n"); using namespace framework; PADDLE_ENFORCE(ctx->HasInput("X"), "Input of BlockExpandOp should not be null."); @@ -34,7 +33,6 @@ class BlockExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); - printf("op infershape2\n"); int block_height = ctx->Attrs().Get("blockHeight"); int block_width = ctx->Attrs().Get("blockWidth"); int stride_height = ctx->Attrs().Get("strideHeight"); @@ -60,8 +58,6 @@ class BlockExpandOp : public framework::OperatorWithKernel { // reshape into [seqLength, stepSize], where seqLength is equal // output_height * output_width, stepSize is equal // input_channels * blockHeight * blockWidth - printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, output_height, - output_width, C, block_height, block_width); ctx->SetOutputDim( "Out", {N, output_height, output_width, C, block_height, block_width}); @@ -81,7 +77,6 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { H: height W: width )DOC"); - printf("opmakeer\n"); AddOutput("Out", "(LodTensor)The output data of block_expand op,"); AddAttr("blockHeight", "(int)height of block."); AddAttr("blockWidth", "(int)width of block."); @@ -117,14 +112,9 @@ class BlockExpandGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { using namespace framework; PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output of BlockExpandOp op should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - - auto in_dim = ctx->GetInputDim("X"); - - ctx->SetOutputDim(GradVarName("Out"), in_dim); + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 69bd7d6987..38d0626c73 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -68,11 +68,7 @@ class BlockExpandKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); - printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, outputHeight, - outputWidth, C, block_height, block_width); - for (int i = 0; i < N; i++) { - printf("i:%d\n", i); Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); Tensor dst = out->Slice(i, i + 1).Resize( {outputHeight, outputWidth, C, block_height, block_width}); @@ -89,9 +85,12 @@ class BlockExpandGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; auto* in = ctx.Input("X"); - auto* out = ctx.Input("Out"); - auto* out_grad = ctx.Output(GradVarName("Out")); - out_grad->mutable_data(ctx.GetPlace()); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); + + auto x_v = framework::EigenVector::Flatten(*d_x); + 
x_v.device(ctx.GetEigenDevice()) = x_v.constant(0.0); auto in_dim = in->dims(); int N = in_dim[0]; @@ -113,16 +112,12 @@ class BlockExpandGradKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); - printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, outputHeight, - outputWidth, C, block_height, block_width); - for (int i = 0; i < N; i++) { - Tensor dst = - out_grad->Slice(i, i + 1).Resize({C, img_height, img_width}); - Tensor src = out->Slice(i, i + 1).Resize( + Tensor dst = d_x->Slice(i, i + 1).Resize({C, img_height, img_width}); + Tensor src = d_out->Slice(i, i + 1).Resize( {outputHeight, outputWidth, C, block_height, block_width}); - math::Im2ColFunctor f; - f(ctx.device_context(), src, dst, stride_height, stride_width, + math::Col2ImFunctor f; + f(ctx.device_context(), dst, src, stride_height, stride_width, padding_height, padding_width); } } diff --git a/python/paddle/v2/framework/tests/test_block_expand_op.py b/python/paddle/v2/framework/tests/test_block_expand_op.py index f8f4afc880..c85f3a1ef1 100644 --- a/python/paddle/v2/framework/tests/test_block_expand_op.py +++ b/python/paddle/v2/framework/tests/test_block_expand_op.py @@ -113,16 +113,30 @@ def col2img(attrs, col, img): col[col_row_idx][col_col_idx][channel][filter_row_idx][filter_col_idx] -class TestBlockExpandMulOp(OpTest): +class TestBlockExpandOp(OpTest): + def get_input_data(self, C, H, W): + x = np.random.uniform(0.1, 1, [C, H, W]).astype("float32") + for c in range(0, C): + for h in range(0, H): + for w in range(0, W): + #x[c][h][w] = c * H * W + h *W + w + x[c][h][w] = 0.2 + 0.01 * (c * H * W + h * W + w) + return x + def setUp(self): - x = np.random.uniform(0.1, 1, [3, 9, 9]).astype("float32") + C = 3 + H = 4 + W = 4 + x = self.get_input_data(C, H, W) + #print x + attrs = { - 'blockHeight': 3, - 'blockWidth': 3, - 'strideHeight': 2, - 'strideWidth': 2, - 'paddingHeight': 3, - 'paddingWidth': 3, + 'blockHeight': 2, + 'blockWidth': 2, + 'strideHeight': 1, + 'strideWidth': 1, + 'paddingHeight': 1, + 'paddingWidth': 1, } outputHeight, outputWidth = get_output_shape(attrs, x) @@ -131,7 +145,7 @@ class TestBlockExpandMulOp(OpTest): attrs['blockHeight'], attrs['blockWidth']]).astype("float32") self.op_type = "block_expand" - self.inputs = {'X': x.reshape(1, 3, 9, 9)} + self.inputs = {'X': x.reshape(1, C, H, W)} self.attrs = attrs im2col(attrs, x, out) @@ -139,16 +153,14 @@ class TestBlockExpandMulOp(OpTest): 'Out':out.reshape(1, outputHeight, outputWidth, x.shape[0], \ attrs['blockHeight'], attrs['blockWidth']) } - #print out + """ def test_check_output(self): self.check_output() - print 1 - """ + def test_check_grad_normal(self): - self.check_grad(['X'], 'Out') - """ + self.check_grad(['X'], 'Out', max_relative_error=0.01) if __name__ == '__main__': From 4422a556dca7c9461dd7fdcf91b96a3e429aaf66 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 17 Oct 2017 12:46:33 +0000 Subject: [PATCH 09/54] rm not need --- .../framework/tests/test_block_expand_op.py | 28 ++++++++----------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_block_expand_op.py b/python/paddle/v2/framework/tests/test_block_expand_op.py index c85f3a1ef1..4c66493d6e 100644 --- a/python/paddle/v2/framework/tests/test_block_expand_op.py +++ b/python/paddle/v2/framework/tests/test_block_expand_op.py @@ -27,14 +27,12 @@ def get_output_shape(attrs, x): return outputHeight, outputWidth -""" 
-im: {CHW} -col: - {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} -""" - - def im2col(attrs, im, col): + """ + im: {CHW} + col: + {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} + """ input_channels = im.shape[0] inputHeight = im.shape[1] inputWidth = im.shape[2] @@ -74,14 +72,12 @@ def im2col(attrs, im, col): im_row_offset][im_col_offset] -""" -img: {CHW} -col: - {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} -""" - - def col2img(attrs, col, img): + """ + img: {CHW} + col: + {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} + """ input_channels = im.shape[0] inputHeight = im.shape[1] inputWidth = im.shape[2] @@ -154,13 +150,11 @@ class TestBlockExpandOp(OpTest): attrs['blockHeight'], attrs['blockWidth']) } - """ def test_check_output(self): self.check_output() - """ def test_check_grad_normal(self): - self.check_grad(['X'], 'Out', max_relative_error=0.01) + self.check_grad(['X'], 'Out') if __name__ == '__main__': From dbe0583cb0e79bfb156a9816b1ae2e5dfaf2c383 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 21 Nov 2017 09:05:33 +0000 Subject: [PATCH 10/54] mv test position to fluid --- .../paddle/v2/{framework => fluid}/tests/test_block_expand_op.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/paddle/v2/{framework => fluid}/tests/test_block_expand_op.py (100%) diff --git a/python/paddle/v2/framework/tests/test_block_expand_op.py b/python/paddle/v2/fluid/tests/test_block_expand_op.py similarity index 100% rename from python/paddle/v2/framework/tests/test_block_expand_op.py rename to python/paddle/v2/fluid/tests/test_block_expand_op.py From 25a3d2d76f0146ac580cb484bb5a638ddc029bfa Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 22 Nov 2017 06:18:47 +0000 Subject: [PATCH 11/54] fix by comments --- paddle/operators/block_expand_op.cc | 3 +- paddle/operators/block_expand_op.h | 22 ++- .../v2/fluid/tests/test_block_expand_op.py | 175 +++++++++++------- 3 files changed, 123 insertions(+), 77 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index d72c6b2de1..f25cc4f9de 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -30,7 +30,8 @@ class BlockExpandOp : public framework::OperatorWithKernel { "Output of BlockExpandOp op should not be null."); auto in_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); + PADDLE_ENFORCE_EQ(in_dim.size(), 4, + "Input(X) format must be 4D tensor, eg., NCHW."); PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); int block_height = ctx->Attrs().Get("blockHeight"); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 38d0626c73..aa0db2705c 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -68,13 +68,16 @@ class BlockExpandKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); + std::vector stride({stride_height, stride_width}); + std::vector padding({padding_height, padding_width}); + for (int i = 0; i < N; i++) { - Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize( + Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); + Tensor dst = out->Slice(i, i + 1).Resize( {outputHeight, outputWidth, C, block_height, block_width}); + 
math::Im2ColFunctor f; - f(ctx.device_context(), src, dst, stride_height, stride_width, - padding_height, padding_width); + f(ctx.device_context(), src, stride, padding, &dst); } } }; @@ -112,13 +115,16 @@ class BlockExpandGradKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); + std::vector stride({stride_height, stride_width}); + std::vector padding({padding_height, padding_width}); + // std::vector stride({stride_height, stride_width}); + for (int i = 0; i < N; i++) { - Tensor dst = d_x->Slice(i, i + 1).Resize({C, img_height, img_width}); - Tensor src = d_out->Slice(i, i + 1).Resize( + Tensor dst = d_x->Slice(i, i + 1).Resize({C, img_height, img_width}); + Tensor src = d_out->Slice(i, i + 1).Resize( {outputHeight, outputWidth, C, block_height, block_width}); math::Col2ImFunctor f; - f(ctx.device_context(), dst, src, stride_height, stride_width, - padding_height, padding_width); + f(ctx.device_context(), dst, stride, padding, &src); } } }; diff --git a/python/paddle/v2/fluid/tests/test_block_expand_op.py b/python/paddle/v2/fluid/tests/test_block_expand_op.py index 4c66493d6e..b31ed53f4c 100644 --- a/python/paddle/v2/fluid/tests/test_block_expand_op.py +++ b/python/paddle/v2/fluid/tests/test_block_expand_op.py @@ -4,27 +4,27 @@ from op_test import OpTest def get_output_shape(attrs, x): - imgHeight = x.shape[1] - imgWidth = x.shape[2] + img_height = x.shape[1] + img_width = x.shape[2] - paddingHeight = attrs['paddingHeight'] - paddingWidth = attrs['paddingWidth'] - blockHeight = attrs['blockHeight'] - blockWidth = attrs['blockWidth'] - strideHeight = attrs['strideHeight'] - strideWidth = attrs['strideWidth'] + padding_height = attrs['paddingHeight'] + padding_width = attrs['paddingWidth'] + block_height = attrs['blockHeight'] + block_width = attrs['blockWidth'] + stride_height = attrs['strideHeight'] + stride_width = attrs['strideWidth'] - outputHeight = \ + output_height = \ 1 + \ - (imgHeight + 2 * paddingHeight - blockHeight + strideHeight - 1) / \ + (img_height + 2 * padding_height - block_height + stride_height - 1) / \ strideHeight - outputWidth = \ + output_width = \ 1 + \ - (imgWidth + 2 * paddingWidth - blockWidth + strideWidth - 1) / \ - strideWidth + (img_width + 2 * padding_width - block_width + stride_width - 1) / \ + stride_width - return outputHeight, outputWidth + return output_height, output_width def im2col(attrs, im, col): @@ -34,38 +34,39 @@ def im2col(attrs, im, col): {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} """ input_channels = im.shape[0] - inputHeight = im.shape[1] - inputWidth = im.shape[2] + input_height = im.shape[1] + input_width = im.shape[2] - outputHeight = col.shape[0] - outputWidth = col.shape[1] - filterHeight = col.shape[3] - filterWidth = col.shape[4] + output_height = col.shape[0] + output_width = col.shape[1] + filter_height = col.shape[3] + filter_width = col.shape[4] - strideHeight = attrs['strideHeight'] - strideWidth = attrs['strideWidth'] - paddingHeight = attrs['paddingHeight'] - paddingWidth = attrs['paddingWidth'] + stride_height = attrs['strideHeight'] + stride_width = attrs['strideWidth'] + padding_height = attrs['paddingHeight'] + padding_width = attrs['paddingWidth'] - for col_row_idx in range(0, outputHeight): - for col_col_idx in range(0, outputWidth): + for col_row_idx in range(0, output_height): + for col_col_idx in range(0, output_width): for channel in range(0, input_channels): - for 
filter_row_idx in range(0, filterHeight): - for filter_col_idx in range(0, filterWidth): - im_row_offset = col_row_idx * strideHeight \ - + filter_row_idx - paddingHeight + for filter_row_idx in range(0, filter_height): + for filter_col_idx in range(0, filter_width): + im_row_offset = col_row_idx * stride_height \ + + filter_row_idx - padding_height - im_col_offset = col_col_idx * strideWidth \ - + filter_col_idx - paddingWidth + im_col_offset = col_col_idx * stride_width \ + + filter_col_idx - padding_width - if (im_row_offset < 0 or im_row_offset >= inputHeight or + if (im_row_offset < 0 or + im_row_offset >= input_height or im_col_offset < 0 or - im_col_offset >= inputWidth): + im_col_offset >= input_width): col[col_row_idx][col_col_idx][channel][\ filter_row_idx][filter_col_idx] = 0.0 else: - im_offset = (channel * inputHeight + im_row_offset \ - ) * inputWidth + im_col_offset + im_offset = (channel * input_height + im_row_offset \ + ) * input_width + im_col_offset col[col_row_idx][col_col_idx][channel][\ filter_row_idx][filter_col_idx] = im[channel][ \ @@ -76,55 +77,55 @@ def col2img(attrs, col, img): """ img: {CHW} col: - {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} + {output_height, outputWidth, inputChannels, filterHeight, filterWidth} """ input_channels = im.shape[0] - inputHeight = im.shape[1] - inputWidth = im.shape[2] + input_height = im.shape[1] + input_width = im.shape[2] - outputHeight = col.shape[0] - outputWidth = col.shape[1] - filterHeight = col.shape[3] - filterWidth = col.shape[4] + output_height = col.shape[0] + output_width = col.shape[1] + filter_height = col.shape[3] + filter_width = col.shape[4] - strideHeight = attrs['strideHeight'] - strideWidth = attrs['strideWidth'] - paddingHeight = attrs['paddingHeight'] - paddingWidth = attrs['paddingWidth'] + stride_height = attrs['strideHeight'] + stride_width = attrs['strideWidth'] + padding_height = attrs['paddingHeight'] + padding_width = attrs['paddingWidth'] - for col_row_idx in range(0, outputHeight): - for col_col_idx in range(0, outputWidth): + for col_row_idx in range(0, output_height): + for col_col_idx in range(0, output_width): for channel in range(0, input_channels): - for filter_row_idx in range(0, filterHeight): - for filter_col_idx in range(0, filterWidth): + for filter_row_idx in range(0, filter_height): + for filter_col_idx in range(0, filter_width): im_row_offset = \ - col_row_idx * strideHeight + filter_row_idx - paddingHeight + col_row_idx * stride_height + filter_row_idx - padding_height im_col_offset = \ - col_col_idx * strideWidth + filter_col_idx - paddingWidth + col_col_idx * stride_width + filter_col_idx - padding_width if (im_row_offset >= 0 and - im_row_offset < inputHeight and + im_row_offset < input_height and im_col_offset >= 0 and - im_col_offset < inputWidth): + im_col_offset < input_width): im[channel][im_row_offset][im_col_offset] = \ col[col_row_idx][col_col_idx][channel][filter_row_idx][filter_col_idx] -class TestBlockExpandOp(OpTest): - def get_input_data(self, C, H, W): - x = np.random.uniform(0.1, 1, [C, H, W]).astype("float32") - for c in range(0, C): - for h in range(0, H): - for w in range(0, W): - #x[c][h][w] = c * H * W + h *W + w - x[c][h][w] = 0.2 + 0.01 * (c * H * W + h * W + w) +def get_input_data(C, H, W): + x = np.random.uniform(0.1, 1, [C, H, W]).astype("float32") + for c in range(0, C): + for h in range(0, H): + for w in range(0, W): + #x[c][h][w] = c * H * W + h *W + w + x[c][h][w] = 0.2 + 0.01 * (c * H * W + h * W + w) return x + +class 
TestBlockExpandOp(OpTest): def setUp(self): C = 3 H = 4 W = 4 - x = self.get_input_data(C, H, W) - #print x + x = get_input_data(C, H, W) attrs = { 'blockHeight': 2, @@ -135,9 +136,47 @@ class TestBlockExpandOp(OpTest): 'paddingWidth': 1, } - outputHeight, outputWidth = get_output_shape(attrs, x) + output_height, output_width = get_output_shape(attrs, x) + out = np.random.uniform(0.1, 1,\ + [output_height, output_width, x.shape[0], \ + attrs['blockHeight'], attrs['blockWidth']]).astype("float32") + + self.op_type = "block_expand" + self.inputs = {'X': x.reshape(1, C, H, W)} + self.attrs = attrs + + im2col(attrs, x, out) + self.outputs = { + 'Out':out.reshape(1, output_height, output_width, x.shape[0], \ + attrs['blockHeight'], attrs['blockWidth']) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out') + + +class TestBlockExpandOp2(OpTest): + def setUp(self): + C = 3 + H = 4 + W = 5 + x = get_input_data(C, H, W) + + attrs = { + 'blockHeight': 2, + 'blockWidth': 1, + 'strideHeight': 2, + 'strideWidth': 1, + 'paddingHeight': 2, + 'paddingWidth': 1, + } + + output_height, output_width = get_output_shape(attrs, x) out = np.random.uniform(0.1, 1,\ - [outputHeight, outputWidth, x.shape[0], \ + [output_height, output_width, x.shape[0], \ attrs['blockHeight'], attrs['blockWidth']]).astype("float32") self.op_type = "block_expand" @@ -146,7 +185,7 @@ class TestBlockExpandOp(OpTest): im2col(attrs, x, out) self.outputs = { - 'Out':out.reshape(1, outputHeight, outputWidth, x.shape[0], \ + 'Out':out.reshape(1, output_height, output_width, x.shape[0], \ attrs['blockHeight'], attrs['blockWidth']) } From 3cf23bece3f9917eedf7b5d5548aedbfd301854d Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 12 Jan 2018 18:18:21 +0800 Subject: [PATCH 12/54] create paddle_fluid_shared.so library --- paddle/inference/CMakeLists.txt | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt index 8437b2b219..b017283ec3 100644 --- a/paddle/inference/CMakeLists.txt +++ b/paddle/inference/CMakeLists.txt @@ -1,12 +1,20 @@ -set(FLUID_CORE_MODULES - backward proto_desc paddle_memory executor prune init ${GLOB_OP_LIB}) +set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init) cc_library(paddle_fluid_api SRCS inference.cc - DEPS ${FLUID_CORE_MODULES}) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) -# Merge all modules into a simgle static library -cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES}) +# Merge all modules into a single static library +cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + +# Create shared library +add_library(paddle_fluid_shared SHARED inference.cc) + +target_circle_link_libraries(paddle_fluid_shared + ARCHIVE_START + ${GLOB_OP_LIB} + ARCHIVE_END + ${FLUID_CORE_MODULES}) # ptools # just for testing, we may need to change the storing format for inference_model From c0f0f2337e8374fa42097b43c3197be5bbebf699 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 16 Jan 2018 16:24:51 +0800 Subject: [PATCH 13/54] add WITH_FLUID option and third party INSTALL for fluid api --- CMakeLists.txt | 1 + cmake/external/eigen.cmake | 10 ++++++++-- cmake/external/gflags.cmake | 2 +- cmake/external/glog.cmake | 2 +- cmake/external/protobuf.cmake | 2 +- 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 00996cb7ed..b701eb00e8 100644 --- a/CMakeLists.txt 
+++ b/CMakeLists.txt @@ -55,6 +55,7 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) +option(WITH_FLUID "Compile PaddlePaddle fluid only" ON) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index c4712f19eb..d49c8d6011 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -1,8 +1,8 @@ INCLUDE(ExternalProject) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) - -INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) +SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) +INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) ExternalProject_Add( extern_eigen3 @@ -28,3 +28,9 @@ endif() add_dependencies(eigen3 extern_eigen3) LIST(APPEND external_project_dependencies eigen3) + +IF(NOT WITH_C_API AND WITH_FLUID) + INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen) + INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen) + INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported) +ENDIF() diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index d4f252bb9f..6094630454 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -52,7 +52,7 @@ ADD_DEPENDENCIES(gflags extern_gflags) LIST(APPEND external_project_dependencies gflags) -IF(WITH_C_API) +IF(WITH_C_API OR WITH_FLUID) INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags) IF(ANDROID) INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI}) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 0c6b3aafcb..382fbda3b5 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -68,7 +68,7 @@ LINK_LIBRARIES(glog gflags) LIST(APPEND external_project_dependencies glog) -IF(WITH_C_API) +IF(WITH_C_API OR WITH_FLUID) INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog) IF(ANDROID) INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index ff5855052d..365a370a9c 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -250,7 +250,7 @@ IF(NOT PROTOBUF_FOUND) SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." 
FORCE) - IF(WITH_C_API) + IF(WITH_C_API OR WITH_FLUID) INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) IF(ANDROID) INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) From 2be7cf909a6680cc53151b2fe422dddbef13f8da Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 16 Jan 2018 18:30:25 +0800 Subject: [PATCH 14/54] add paddle INSTALL for fluid api --- paddle/framework/CMakeLists.txt | 6 ++++++ paddle/inference/CMakeLists.txt | 6 ++++++ paddle/memory/CMakeLists.txt | 7 +++++++ paddle/platform/CMakeLists.txt | 8 ++++++++ paddle/string/CMakeLists.txt | 7 ++++++- 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 597ea959f2..fcfac5a3e6 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -84,3 +84,9 @@ cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_contex cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) nv_test(data_device_transform_test SRCS data_device_transform_test.cu DEPS operator op_registry init math_function) + +if(NOT WITH_C_API AND WITH_FLUID) + file(GLOB FRAMEWORK_HEADERS *.h) + install(FILES ${FRAMEWORK_HEADERS} DESTINATION include/paddle/framework) + install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details) +endif() diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt index b017283ec3..af882f252b 100644 --- a/paddle/inference/CMakeLists.txt +++ b/paddle/inference/CMakeLists.txt @@ -16,6 +16,12 @@ target_circle_link_libraries(paddle_fluid_shared ARCHIVE_END ${FLUID_CORE_MODULES}) +# install library & headers +if(NOT WITH_C_API AND WITH_FLUID) + install(FILES inference.h DESTINATION include/paddle/inference) + install(TARGETS paddle_fluid_shared DESTINATION lib) +endif() + # ptools # just for testing, we may need to change the storing format for inference_model # and move the dependent of pickle. 
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 8841c14ee0..061ee1a4d4 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -14,3 +14,10 @@ cc_library(paddle_memory system_allocator) cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) + +if(NOT WITH_C_API AND WITH_FLUID) + file(GLOB MEMORY_HEADERS *.h) + file(GLOB MEMORY_DETAIL_HEADERS detail/*.h) + install(FILES ${MEMORY_HEADERS} DESTINATION include/paddle/memory) + install(FILES ${MEMORY_DETAIL_HEADERS} DESTINATION include/paddle/memory/detail) +endif() diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 44f6d85cd1..3742594a50 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -39,3 +39,11 @@ nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) cc_library(profiler SRCS profiler.cc DEPS device_context) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) + +if(NOT WITH_C_API AND WITH_FLUID) + file(GLOB PLATFORM_HEADERS *.h) + file(GLOB PLATFORM_dynload_HEADERS dynload/*.h) + install(FILES ${PLATFORM_HEADERS} DESTINATION include/paddle/platform) + install(FILES ${PLATFORM_HEADERS} DESTINATION include/paddle/platform/dynload) + install(FILES details/device_ptr_cast.h DESTINATION include/paddle/platform/details) +endif() diff --git a/paddle/string/CMakeLists.txt b/paddle/string/CMakeLists.txt index 60667b7287..0fa846c4ed 100644 --- a/paddle/string/CMakeLists.txt +++ b/paddle/string/CMakeLists.txt @@ -1,5 +1,10 @@ cc_library(stringpiece SRCS piece.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) - cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) + +if(NOT WITH_C_API AND WITH_FLUID) + file(GLOB STRING_HEADERS *.h) + install(FILES ${STRING_HEADERS} DESTINATION include/paddle/memory) + install(FILES tinyformat/tinyformat.h DESTINATION include/paddle/memory/tinyformat) +endif() From 363538803a5138d3a554e34f30faf9c99156a0ef Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 16 Jan 2018 19:07:37 +0800 Subject: [PATCH 15/54] set WITH_FLUID=OFF when WITH_C_API=ON --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b701eb00e8..ad1b6f23c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,6 +108,10 @@ if (WITH_C_API AND WITH_PYTHON) "different Python interpreter from compiling.") endif() +if (WITH_C_API) + set(WITH_FLUID OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE) +endif() + if(MOBILE_INFERENCE) set(THIRD_PARTY_BUILD_TYPE MinSizeRel) else() From e82f1008a82232936529ec4bba70a59880915912 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 17 Jan 2018 00:42:20 +0800 Subject: [PATCH 16/54] Finish block expand op 1. Add lod to output 2. Fix im2col arguments list 3. Refine code and doc 4. 
Fix output shape --- paddle/operators/block_expand_op.cc | 119 +++++++---- paddle/operators/block_expand_op.h | 140 ++++++------ .../v2/fluid/tests/test_block_expand_op.py | 202 ++++++++---------- 3 files changed, 239 insertions(+), 222 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index f25cc4f9de..317a43bb7b 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -32,37 +32,27 @@ class BlockExpandOp : public framework::OperatorWithKernel { auto in_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input(X) format must be 4D tensor, eg., NCHW."); - PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); - int block_height = ctx->Attrs().Get("blockHeight"); - int block_width = ctx->Attrs().Get("blockWidth"); - int stride_height = ctx->Attrs().Get("strideHeight"); - int stride_width = ctx->Attrs().Get("strideWidth"); - int padding_height = ctx->Attrs().Get("paddingHeight"); - int padding_width = ctx->Attrs().Get("paddingWidth"); + int block_height = ctx->Attrs().Get("block_height"); + int block_width = ctx->Attrs().Get("block_width"); + int stride_height = ctx->Attrs().Get("stride_height"); + int stride_width = ctx->Attrs().Get("stride_width"); + int padding_height = ctx->Attrs().Get("padding_height"); + int padding_width = ctx->Attrs().Get("padding_width"); - int N = in_dim[0]; - int C = in_dim[1]; + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - int output_height = 0; - int output_width = 0; + int output_height = get_output_size(img_height, block_height, stride_height, + padding_height); + int output_width = + get_output_size(img_width, block_width, stride_width, padding_width); - get_blockexpand_output_shape(img_height, img_width, block_height, - block_width, stride_height, stride_width, - padding_height, padding_width, output_height, - output_width); - - // The result of im2col is [output_height, output_width, - // inputChannels, filterHeight, filterWidth], and it is easy to - // reshape into [seqLength, stepSize], where seqLength is equal - // output_height * output_width, stepSize is equal - // input_channels * blockHeight * blockWidth - ctx->SetOutputDim( - "Out", {N, output_height, output_width, C, block_height, block_width}); - - // ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("Out", {batch_size * output_height * output_width, + img_channels * block_height * block_width}); + // TODO(wanghaoshuang): cal lod in complie time } }; @@ -79,28 +69,69 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { W: width )DOC"); AddOutput("Out", "(LodTensor)The output data of block_expand op,"); - AddAttr("blockHeight", "(int)height of block."); - AddAttr("blockWidth", "(int)width of block."); - AddAttr("strideHeight", "(int)height of stride."); - AddAttr("strideWidth", "(int)width of stride."); - AddAttr("paddingHeight", "(int)height of padding."); - AddAttr("paddingWidth", "(int)width of padding."); + AddAttr("block_height", "(int)height of block."); + AddAttr("block_width", "(int)width of block."); + AddAttr("stride_height", "(int)height of stride."); + AddAttr("stride_width", "(int)width of stride."); + AddAttr("padding_height", "(int)height of padding."); + AddAttr("padding_width", "(int)width of padding."); AddComment(R"DOC( Expand feature map to minibatch matrix. 
- matirx height is: output_height * output_width -- matrix width is: blockHeight * blockWidth * channels - -output_height = - 1 + (2 * paddingHeight + img_height - blockHeight + strideHeight - 1) / - strideHeight; -output_width = - 1 + (2 * paddingWidth + img_width - blockWidth + strideWidth - 1) / - strideWidth; - -The expand method is the same with ExpandConvLayer, but saved the transposed -value. After expanding, The number of time steps are output_height * output_width -and the dimension of each time step is blockHeight * blockWidth * channels. -This layer can be used after convolution neural network, and before recurrent neural network. +- matrix width is: block_height * block_width * channels + +output_height = + 1 + (2 * padding_height + img_height - block_height + stride_height - 1) / + stride_height; +output_width = + 1 + (2 * padding_width + img_width - block_width + stride_width - 1) / + stride_width; + +After expanding, The number of time steps are output_height * output_width +and the dimension of each time step is block_height * block_width * channels. +This op can be used after convolution neural network, and before recurrent neural network. + +Given: + +x = [[[[ 6. 2. 1.] + [ 8. 3. 5.] + [ 0. 2. 6.]] + + [[ 2. 4. 4.] + [ 6. 3. 0.] + [ 6. 4. 7.]]] + + [[[ 6. 7. 1.] + [ 5. 7. 9.] + [ 2. 4. 8.]] + + [[ 1. 2. 1.] + [ 1. 3. 5.] + [ 9. 0. 8.]]]] +x.dims = {2, 2, 3, 3} + +And: + +block_height = 2 +block_width = 2 +stride_height = 1 +stride_width = 1 +padding_height = 0 +padding_width = 0 + +Then: + +output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.] + [ 2. 1. 3. 5. 4. 4. 3. 0.] + [ 8. 3. 0. 2. 6. 3. 6. 4.] + [ 3. 5. 2. 6. 3. 0. 4. 7.] + [ 6. 7. 5. 7. 1. 2. 1. 3.] + [ 7. 1. 7. 9. 2. 1. 3. 5.] + [ 5. 7. 2. 4. 1. 3. 9. 0.] + [ 7. 9. 4. 8. 3. 5. 0. 8.]] +output.dims = {8, 9} +output.lod = [[0, 4, 8]] + )DOC"); } }; diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index aa0db2705c..022dc3a123 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -23,20 +23,9 @@ namespace paddle { namespace operators { -inline void get_blockexpand_output_shape(int img_height, int img_width, - int block_height, int block_width, - int stride_height, int stride_width, - int padding_height, int padding_width, - int& outputHeight, int& outputWidth) { - outputHeight = - 1 + - (img_height + 2 * padding_height - block_height + stride_height - 1) / - stride_height; - - outputWidth = - 1 + - (img_width + 2 * padding_width - block_width + stride_width - 1) / - stride_width; +inline int get_output_size(int img_size, int block_size, int stride, + int padding) { + return (1 + (img_size + 2 * padding - block_size + stride - 1) / stride); } template @@ -45,40 +34,54 @@ class BlockExpandKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; const Tensor* in = ctx.Input("X"); - Tensor* out = ctx.Output("Out"); + LoDTensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); auto in_dim = in->dims(); - int N = in_dim[0]; - int C = in_dim[1]; + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - - int block_height = ctx.Attr("blockHeight"); - int block_width = ctx.Attr("blockWidth"); - int stride_height = ctx.Attr("strideHeight"); - int stride_width = ctx.Attr("strideWidth"); - int padding_height = ctx.Attr("paddingHeight"); - int padding_width = ctx.Attr("paddingWidth"); - - int outputHeight = 0; - int 
outputWidth = 0; - - get_blockexpand_output_shape( - img_height, img_width, block_height, block_width, stride_height, - stride_width, padding_height, padding_width, outputHeight, outputWidth); - - std::vector stride({stride_height, stride_width}); - std::vector padding({padding_height, padding_width}); - - for (int i = 0; i < N; i++) { - Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize( - {outputHeight, outputWidth, C, block_height, block_width}); + int block_height = ctx.Attr("block_height"); + int block_width = ctx.Attr("block_width"); + int stride_height = ctx.Attr("stride_height"); + int stride_width = ctx.Attr("stride_width"); + int padding_height = ctx.Attr("padding_height"); + int padding_width = ctx.Attr("padding_width"); + + int output_height = get_output_size(img_height, block_height, stride_height, + padding_height); + int output_width = + get_output_size(img_width, block_width, stride_width, padding_width); + + const std::vector dilations({1, 1}); + const std::vector strides( + {stride_height, stride_width, stride_height, stride_width}); + const std::vector paddings( + {padding_height, padding_width, padding_height, padding_width}); + + auto out_dims = out->dims(); + out->Resize({batch_size, out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + const Tensor src = + in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + Tensor dst = out->Slice(i, i + 1).Resize({output_height, output_width, + img_channels, block_height, + block_width}); math::Im2ColFunctor f; - f(ctx.device_context(), src, stride, padding, &dst); + f(ctx.device_context(), src, dilations, strides, paddings, &dst); } + out->Resize(out_dims); + + // set lod information + // TODO(wanghaoshuang): Move this to InferShape + framework::LoD lod(1); + for (int i = 0, offset = 0; i < batch_size + 1; ++i) { + lod[0].push_back(offset); + offset += output_height * output_width; + } + out->set_lod(lod); } }; @@ -88,7 +91,8 @@ class BlockExpandGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; auto* in = ctx.Input("X"); - auto* d_out = ctx.Input(framework::GradVarName("Out")); + Tensor* d_out = + const_cast(ctx.Input(framework::GradVarName("Out"))); auto* d_x = ctx.Output(GradVarName("X")); d_x->mutable_data(ctx.GetPlace()); @@ -96,36 +100,40 @@ class BlockExpandGradKernel : public framework::OpKernel { x_v.device(ctx.GetEigenDevice()) = x_v.constant(0.0); auto in_dim = in->dims(); - int N = in_dim[0]; - int C = in_dim[1]; + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - int block_height = ctx.Attr("blockHeight"); - int block_width = ctx.Attr("blockWidth"); - int stride_height = ctx.Attr("strideHeight"); - int stride_width = ctx.Attr("strideWidth"); - int padding_height = ctx.Attr("paddingHeight"); - int padding_width = ctx.Attr("paddingWidth"); - - int outputHeight = 0; - int outputWidth = 0; - - get_blockexpand_output_shape( - img_height, img_width, block_height, block_width, stride_height, - stride_width, padding_height, padding_width, outputHeight, outputWidth); - - std::vector stride({stride_height, stride_width}); - std::vector padding({padding_height, padding_width}); - // std::vector stride({stride_height, stride_width}); - - for (int i = 0; i < N; i++) { - Tensor dst = d_x->Slice(i, i + 1).Resize({C, img_height, img_width}); - Tensor src = d_out->Slice(i, i + 1).Resize( - 
{outputHeight, outputWidth, C, block_height, block_width}); + int block_height = ctx.Attr("block_height"); + int block_width = ctx.Attr("block_width"); + int stride_height = ctx.Attr("stride_height"); + int stride_width = ctx.Attr("stride_width"); + int padding_height = ctx.Attr("padding_height"); + int padding_width = ctx.Attr("padding_width"); + int output_height = get_output_size(img_height, block_height, stride_height, + padding_height); + int output_width = + get_output_size(img_width, block_width, stride_width, padding_width); + + const std::vector dilations({1, 1}); + const std::vector strides( + {stride_height, stride_width, stride_height, stride_width}); + const std::vector paddings( + {padding_height, padding_width, padding_height, padding_width}); + + auto d_out_dims = d_out->dims(); + d_out->Resize({batch_size, d_out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + Tensor dst = + d_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + const Tensor src = d_out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, block_height, + block_width}); math::Col2ImFunctor f; - f(ctx.device_context(), dst, stride, padding, &src); + f(ctx.device_context(), src, dilations, strides, paddings, &dst); } + d_out->Resize(d_out_dims); } }; diff --git a/python/paddle/v2/fluid/tests/test_block_expand_op.py b/python/paddle/v2/fluid/tests/test_block_expand_op.py index b31ed53f4c..424bc7dc6e 100644 --- a/python/paddle/v2/fluid/tests/test_block_expand_op.py +++ b/python/paddle/v2/fluid/tests/test_block_expand_op.py @@ -4,20 +4,20 @@ from op_test import OpTest def get_output_shape(attrs, x): - img_height = x.shape[1] - img_width = x.shape[2] + img_height = x.shape[2] + img_width = x.shape[3] - padding_height = attrs['paddingHeight'] - padding_width = attrs['paddingWidth'] - block_height = attrs['blockHeight'] - block_width = attrs['blockWidth'] - stride_height = attrs['strideHeight'] - stride_width = attrs['strideWidth'] + padding_height = attrs['padding_height'] + padding_width = attrs['padding_width'] + block_height = attrs['block_height'] + block_width = attrs['block_width'] + stride_height = attrs['stride_height'] + stride_width = attrs['stride_width'] output_height = \ 1 + \ (img_height + 2 * padding_height - block_height + stride_height - 1) / \ - strideHeight + stride_height output_width = \ 1 + \ @@ -42,10 +42,10 @@ def im2col(attrs, im, col): filter_height = col.shape[3] filter_width = col.shape[4] - stride_height = attrs['strideHeight'] - stride_width = attrs['strideWidth'] - padding_height = attrs['paddingHeight'] - padding_width = attrs['paddingWidth'] + stride_height = attrs['stride_height'] + stride_width = attrs['stride_width'] + padding_height = attrs['padding_height'] + padding_width = attrs['padding_width'] for col_row_idx in range(0, output_height): for col_col_idx in range(0, output_width): @@ -73,83 +73,51 @@ def im2col(attrs, im, col): im_row_offset][im_col_offset] -def col2img(attrs, col, img): - """ - img: {CHW} - col: - {output_height, outputWidth, inputChannels, filterHeight, filterWidth} - """ - input_channels = im.shape[0] - input_height = im.shape[1] - input_width = im.shape[2] - - output_height = col.shape[0] - output_width = col.shape[1] - filter_height = col.shape[3] - filter_width = col.shape[4] +def block_expand(inputs, attrs): + output_height, output_width = get_output_shape(attrs, inputs) + img_channels = inputs.shape[1] + batch_size = inputs.shape[0] + out = np.zeros([ + batch_size, output_height, output_width, 
img_channels, + attrs['block_height'], attrs['block_width'] + ]).astype("float32") - stride_height = attrs['strideHeight'] - stride_width = attrs['strideWidth'] - padding_height = attrs['paddingHeight'] - padding_width = attrs['paddingWidth'] + for i in range(len(inputs)): + im2col(attrs, inputs[i], out[i]) - for col_row_idx in range(0, output_height): - for col_col_idx in range(0, output_width): - for channel in range(0, input_channels): - for filter_row_idx in range(0, filter_height): - for filter_col_idx in range(0, filter_width): - im_row_offset = \ - col_row_idx * stride_height + filter_row_idx - padding_height - im_col_offset = \ - col_col_idx * stride_width + filter_col_idx - padding_width - if (im_row_offset >= 0 and - im_row_offset < input_height and - im_col_offset >= 0 and - im_col_offset < input_width): - im[channel][im_row_offset][im_col_offset] = \ - col[col_row_idx][col_col_idx][channel][filter_row_idx][filter_col_idx] - - -def get_input_data(C, H, W): - x = np.random.uniform(0.1, 1, [C, H, W]).astype("float32") - for c in range(0, C): - for h in range(0, H): - for w in range(0, W): - #x[c][h][w] = c * H * W + h *W + w - x[c][h][w] = 0.2 + 0.01 * (c * H * W + h * W + w) - return x + out = out.reshape([ + batch_size * output_height * output_width, + img_channels * attrs['block_height'] * attrs['block_width'] + ]) + return out class TestBlockExpandOp(OpTest): - def setUp(self): - C = 3 - H = 4 - W = 4 - x = get_input_data(C, H, W) - - attrs = { - 'blockHeight': 2, - 'blockWidth': 2, - 'strideHeight': 1, - 'strideWidth': 1, - 'paddingHeight': 1, - 'paddingWidth': 1, + def config(self): + self.batch_size = 1 + self.img_channels = 3 + self.img_height = 4 + self.img_width = 4 + self.attrs = { + 'block_height': 2, + 'block_width': 2, + 'stride_height': 1, + 'stride_width': 1, + 'padding_height': 1, + 'padding_width': 1, } - output_height, output_width = get_output_shape(attrs, x) - out = np.random.uniform(0.1, 1,\ - [output_height, output_width, x.shape[0], \ - attrs['blockHeight'], attrs['blockWidth']]).astype("float32") - + def setUp(self): + self.config() self.op_type = "block_expand" - self.inputs = {'X': x.reshape(1, C, H, W)} - self.attrs = attrs + #x = np.random.uniform(0.1, 1, + x = np.random.randint(0, 10, [ + self.batch_size, self.img_channels, self.img_height, self.img_width + ]).astype("float32") - im2col(attrs, x, out) - self.outputs = { - 'Out':out.reshape(1, output_height, output_width, x.shape[0], \ - attrs['blockHeight'], attrs['blockWidth']) - } + out = block_expand(x, self.attrs) + self.inputs = {'X': x} + self.outputs = {'Out': out} def test_check_output(self): self.check_output() @@ -158,42 +126,52 @@ class TestBlockExpandOp(OpTest): self.check_grad(['X'], 'Out') -class TestBlockExpandOp2(OpTest): - def setUp(self): - C = 3 - H = 4 - W = 5 - x = get_input_data(C, H, W) - - attrs = { - 'blockHeight': 2, - 'blockWidth': 1, - 'strideHeight': 2, - 'strideWidth': 1, - 'paddingHeight': 2, - 'paddingWidth': 1, +class TestBlockExpandOpCase2(TestBlockExpandOp): + def config(self): + self.batch_size = 2 + self.img_channels = 3 + self.img_height = 4 + self.img_width = 5 + self.attrs = { + 'block_height': 2, + 'block_width': 1, + 'stride_height': 2, + 'stride_width': 1, + 'padding_height': 2, + 'padding_width': 1, } - output_height, output_width = get_output_shape(attrs, x) - out = np.random.uniform(0.1, 1,\ - [output_height, output_width, x.shape[0], \ - attrs['blockHeight'], attrs['blockWidth']]).astype("float32") - - self.op_type = "block_expand" - self.inputs = {'X': 
x.reshape(1, C, H, W)} - self.attrs = attrs - im2col(attrs, x, out) - self.outputs = { - 'Out':out.reshape(1, output_height, output_width, x.shape[0], \ - attrs['blockHeight'], attrs['blockWidth']) - } +class TestBlockExpandOpCase3(TestBlockExpandOp): + def config(self): + self.batch_size = 3 + self.img_channels = 1 + self.img_height = 4 + self.img_width = 5 + self.attrs = { + 'block_height': 2, + 'block_width': 1, + 'stride_height': 2, + 'stride_width': 1, + 'padding_height': 2, + 'padding_width': 0, + } - def test_check_output(self): - self.check_output() - def test_check_grad_normal(self): - self.check_grad(['X'], 'Out') +class TestBlockExpandOpCase4(TestBlockExpandOp): + def config(self): + self.batch_size = 2 + self.img_channels = 2 + self.img_height = 3 + self.img_width = 3 + self.attrs = { + 'block_height': 2, + 'block_width': 2, + 'stride_height': 1, + 'stride_width': 1, + 'padding_height': 0, + 'padding_width': 0, + } if __name__ == '__main__': From 92baa885a0bd6f752fbda290aa69b698b90bd53a Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 17 Jan 2018 09:56:17 +0800 Subject: [PATCH 17/54] Fix code style --- paddle/operators/block_expand_op.cc | 2 -- paddle/operators/block_expand_op.h | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index 317a43bb7b..bef82183b8 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -23,7 +23,6 @@ class BlockExpandOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - using namespace framework; PADDLE_ENFORCE(ctx->HasInput("X"), "Input of BlockExpandOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -142,7 +141,6 @@ class BlockExpandGradOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - using namespace framework; PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) shouldn't be null."); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 022dc3a123..2e4f0cb6f1 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -23,6 +23,9 @@ namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + inline int get_output_size(int img_size, int block_size, int stride, int padding) { return (1 + (img_size + 2 * padding - block_size + stride - 1) / stride); @@ -32,7 +35,6 @@ template class BlockExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using namespace framework; const Tensor* in = ctx.Input("X"); LoDTensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -89,11 +91,10 @@ template class BlockExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using namespace framework; auto* in = ctx.Input("X"); Tensor* d_out = const_cast(ctx.Input(framework::GradVarName("Out"))); - auto* d_x = ctx.Output(GradVarName("X")); + auto* d_x = ctx.Output(framework::GradVarName("X")); d_x->mutable_data(ctx.GetPlace()); auto x_v = framework::EigenVector::Flatten(*d_x); From 09adb769037b34fbe8a50fd48bc3284f13456f3a Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 17 Jan 2018 11:15:54 +0800 
Subject: [PATCH 18/54] Fix code style --- paddle/operators/block_expand_op.cc | 21 ++++++++++----------- paddle/operators/block_expand_op.cu | 9 +++++---- paddle/operators/block_expand_op.h | 17 ++++++++++------- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index bef82183b8..f9b75ffee7 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -57,16 +57,14 @@ class BlockExpandOp : public framework::OperatorWithKernel { class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { public: - BlockExpandOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + BlockExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", R"DOC( -(Tensor)The input tensor has NCHW format. - N: batch size - C: channels - H: height - W: width -)DOC"); + AddInput("X", + "(Tensor)The input tensor has NCHW format." + "N: batch size" + "C: channels" + "H: height" + "W: width"); AddOutput("Out", "(LodTensor)The output data of block_expand op,"); AddAttr("block_height", "(int)height of block."); AddAttr("block_width", "(int)width of block."); @@ -155,7 +153,8 @@ namespace ops = paddle::operators; REGISTER_OP(block_expand, ops::BlockExpandOp, ops::BlockExpandOpMaker, block_expand_grad, ops::BlockExpandGradOp); REGISTER_OP_CPU_KERNEL( - block_expand, ops::BlockExpandKernel); + block_expand, + ops::BlockExpandKernel); REGISTER_OP_CPU_KERNEL( block_expand_grad, - ops::BlockExpandGradKernel); + ops::BlockExpandGradKernel); diff --git a/paddle/operators/block_expand_op.cu b/paddle/operators/block_expand_op.cu index 492ac0c9b2..c17b113807 100644 --- a/paddle/operators/block_expand_op.cu +++ b/paddle/operators/block_expand_op.cu @@ -17,8 +17,9 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - block_expand, ops::BlockExpandKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + block_expand, + ops::BlockExpandKernel); +REGISTER_OP_CUDA_KERNEL( block_expand_grad, - ops::BlockExpandGradKernel); + ops::BlockExpandGradKernel); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 2e4f0cb6f1..72760fb23c 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -31,7 +31,7 @@ inline int get_output_size(int img_size, int block_size, int stride, return (1 + (img_size + 2 * padding - block_size + stride - 1) / stride); } -template +template class BlockExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -71,8 +71,9 @@ class BlockExpandKernel : public framework::OpKernel { img_channels, block_height, block_width}); - math::Im2ColFunctor f; - f(ctx.device_context(), src, dilations, strides, paddings, &dst); + math::Im2ColFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); } out->Resize(out_dims); @@ -87,7 +88,7 @@ class BlockExpandKernel : public framework::OpKernel { } }; -template +template class BlockExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -98,7 +99,8 @@ class BlockExpandGradKernel : public framework::OpKernel { d_x->mutable_data(ctx.GetPlace()); auto x_v = framework::EigenVector::Flatten(*d_x); - x_v.device(ctx.GetEigenDevice()) = x_v.constant(0.0); + auto& place = *ctx.template device_context().eigen_device(); 
+ x_v.device(place) = x_v.constant(0.0); auto in_dim = in->dims(); int batch_size = in_dim[0]; @@ -131,8 +133,9 @@ class BlockExpandGradKernel : public framework::OpKernel { const Tensor src = d_out->Slice(i, i + 1).Resize( {output_height, output_width, img_channels, block_height, block_width}); - math::Col2ImFunctor f; - f(ctx.device_context(), src, dilations, strides, paddings, &dst); + math::Col2ImFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); } d_out->Resize(d_out_dims); } From fe45f2115ff37cb4ec2ee2bd53692b5eaa422613 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 17 Jan 2018 19:01:11 +0800 Subject: [PATCH 19/54] 1. Rename 'block_expand' to im2sequence 2. Refine code and doc --- .../{block_expand_op.cc => im2sequence_op.cc} | 30 +++++++++---------- .../{block_expand_op.cu => im2sequence_op.cu} | 10 +++---- .../{block_expand_op.h => im2sequence_op.h} | 16 ++++++---- ...ck_expand_op.py => test_im2sequence_op.py} | 30 +++++++++++++------ 4 files changed, 51 insertions(+), 35 deletions(-) rename paddle/operators/{block_expand_op.cc => im2sequence_op.cc} (84%) rename paddle/operators/{block_expand_op.cu => im2sequence_op.cu} (77%) rename paddle/operators/{block_expand_op.h => im2sequence_op.h} (92%) rename python/paddle/v2/fluid/tests/{test_block_expand_op.py => test_im2sequence_op.py} (85%) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/im2sequence_op.cc similarity index 84% rename from paddle/operators/block_expand_op.cc rename to paddle/operators/im2sequence_op.cc index f9b75ffee7..9b2397bdc8 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/im2sequence_op.cc @@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/block_expand_op.h" +#include "paddle/operators/im2sequence_op.h" namespace paddle { namespace operators { -class BlockExpandOp : public framework::OperatorWithKernel { +class Im2SequenceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input of BlockExpandOp should not be null."); + "Input(X) of Im2SequenceOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output of BlockExpandOp op should not be null."); + "Output(Out) of Im2SequenceOp op should not be null."); auto in_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(in_dim.size(), 4, @@ -55,9 +55,9 @@ class BlockExpandOp : public framework::OperatorWithKernel { } }; -class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { +class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { public: - BlockExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) + Im2SequenceOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor)The input tensor has NCHW format." 
@@ -65,7 +65,7 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { "C: channels" "H: height" "W: width"); - AddOutput("Out", "(LodTensor)The output data of block_expand op,"); + AddOutput("Out", "(LodTensor)The output data of im2sequence op,"); AddAttr("block_height", "(int)height of block."); AddAttr("block_width", "(int)width of block."); AddAttr("stride_height", "(int)height of stride."); @@ -73,7 +73,7 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("padding_height", "(int)height of padding."); AddAttr("padding_width", "(int)width of padding."); AddComment(R"DOC( -Expand feature map to minibatch matrix. +Convert feature map to minibatch matrix. - matirx height is: output_height * output_width - matrix width is: block_height * block_width * channels @@ -133,7 +133,7 @@ output.lod = [[0, 4, 8]] } }; -class BlockExpandGradOp : public framework::OperatorWithKernel { +class Im2SequenceGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -150,11 +150,11 @@ class BlockExpandGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(block_expand, ops::BlockExpandOp, ops::BlockExpandOpMaker, - block_expand_grad, ops::BlockExpandGradOp); +REGISTER_OP(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker, + im2sequence_grad, ops::Im2SequenceGradOp); REGISTER_OP_CPU_KERNEL( - block_expand, - ops::BlockExpandKernel); + im2sequence, + ops::Im2SequenceKernel); REGISTER_OP_CPU_KERNEL( - block_expand_grad, - ops::BlockExpandGradKernel); + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/operators/block_expand_op.cu b/paddle/operators/im2sequence_op.cu similarity index 77% rename from paddle/operators/block_expand_op.cu rename to paddle/operators/im2sequence_op.cu index c17b113807..9db7529112 100644 --- a/paddle/operators/block_expand_op.cu +++ b/paddle/operators/im2sequence_op.cu @@ -13,13 +13,13 @@ limitations under the License. 
*/ #define EIGEN_USE_GPU -#include "paddle/operators/block_expand_op.h" +#include "paddle/operators/im2sequence_op.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - block_expand, - ops::BlockExpandKernel); + im2sequence, + ops::Im2SequenceKernel); REGISTER_OP_CUDA_KERNEL( - block_expand_grad, - ops::BlockExpandGradKernel); + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/im2sequence_op.h similarity index 92% rename from paddle/operators/block_expand_op.h rename to paddle/operators/im2sequence_op.h index 72760fb23c..85d6cac444 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/im2sequence_op.h @@ -14,11 +14,11 @@ #pragma once -#include "paddle/operators/math/math_function.h" - +#include "paddle/framework/data_layout.h" #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/math/im2col.h" +#include "paddle/operators/math/math_function.h" namespace paddle { namespace operators { @@ -32,13 +32,16 @@ inline int get_output_size(int img_size, int block_size, int stride, } template -class BlockExpandKernel : public framework::OpKernel { +class Im2SequenceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const Tensor* in = ctx.Input("X"); LoDTensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - + // TODO(wanghaoshuang): Add layout checker after 'set_layout' + // being available for python API + // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW, + // "Input(X) layout must be NCHW"); auto in_dim = in->dims(); int batch_size = in_dim[0]; int img_channels = in_dim[1]; @@ -80,8 +83,9 @@ class BlockExpandKernel : public framework::OpKernel { // set lod information // TODO(wanghaoshuang): Move this to InferShape framework::LoD lod(1); + lod[0].reserve(batch_size + 1); for (int i = 0, offset = 0; i < batch_size + 1; ++i) { - lod[0].push_back(offset); + lod[0][i] = offset; offset += output_height * output_width; } out->set_lod(lod); @@ -89,7 +93,7 @@ class BlockExpandKernel : public framework::OpKernel { }; template -class BlockExpandGradKernel : public framework::OpKernel { +class Im2SequenceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* in = ctx.Input("X"); diff --git a/python/paddle/v2/fluid/tests/test_block_expand_op.py b/python/paddle/v2/fluid/tests/test_im2sequence_op.py similarity index 85% rename from python/paddle/v2/fluid/tests/test_block_expand_op.py rename to python/paddle/v2/fluid/tests/test_im2sequence_op.py index 424bc7dc6e..cd1b2164f0 100644 --- a/python/paddle/v2/fluid/tests/test_block_expand_op.py +++ b/python/paddle/v2/fluid/tests/test_im2sequence_op.py @@ -1,11 +1,24 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
import unittest import numpy as np from op_test import OpTest -def get_output_shape(attrs, x): - img_height = x.shape[2] - img_width = x.shape[3] +def get_output_shape(attrs, in_shape): + img_height = in_shape[2] + img_width = in_shape[3] padding_height = attrs['padding_height'] padding_width = attrs['padding_width'] @@ -73,8 +86,8 @@ def im2col(attrs, im, col): im_row_offset][im_col_offset] -def block_expand(inputs, attrs): - output_height, output_width = get_output_shape(attrs, inputs) +def Im2Sequence(inputs, attrs): + output_height, output_width = get_output_shape(attrs, inputs.shape) img_channels = inputs.shape[1] batch_size = inputs.shape[0] out = np.zeros([ @@ -109,13 +122,12 @@ class TestBlockExpandOp(OpTest): def setUp(self): self.config() - self.op_type = "block_expand" - #x = np.random.uniform(0.1, 1, - x = np.random.randint(0, 10, [ + self.op_type = "im2sequence" + x = np.random.uniform(0.1, 1, [ self.batch_size, self.img_channels, self.img_height, self.img_width ]).astype("float32") - out = block_expand(x, self.attrs) + out = Im2Sequence(x, self.attrs) self.inputs = {'X': x} self.outputs = {'Out': out} From c96b7e8047a9c70a48f9f8e1077187469edfea3e Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 17 Jan 2018 19:01:37 +0800 Subject: [PATCH 20/54] add missing framework.pb.h and fix string install typo --- paddle/framework/CMakeLists.txt | 1 + paddle/string/CMakeLists.txt | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index fcfac5a3e6..c514e41e13 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -88,5 +88,6 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu if(NOT WITH_C_API AND WITH_FLUID) file(GLOB FRAMEWORK_HEADERS *.h) install(FILES ${FRAMEWORK_HEADERS} DESTINATION include/paddle/framework) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework) install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details) endif() diff --git a/paddle/string/CMakeLists.txt b/paddle/string/CMakeLists.txt index 0fa846c4ed..751776dbb5 100644 --- a/paddle/string/CMakeLists.txt +++ b/paddle/string/CMakeLists.txt @@ -5,6 +5,6 @@ cc_test(to_string_test SRCS to_string_test.cc) if(NOT WITH_C_API AND WITH_FLUID) file(GLOB STRING_HEADERS *.h) - install(FILES ${STRING_HEADERS} DESTINATION include/paddle/memory) - install(FILES tinyformat/tinyformat.h DESTINATION include/paddle/memory/tinyformat) + install(FILES ${STRING_HEADERS} DESTINATION include/paddle/string) + install(FILES tinyformat/tinyformat.h DESTINATION include/paddle/string/tinyformat) endif() From 0dd3919a21ee28942821504bb3b8ee2b205bb3ec Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Thu, 18 Jan 2018 10:58:07 +0800 Subject: [PATCH 21/54] Add python wrapper for ctc_evaluator --- python/paddle/v2/fluid/layers/nn.py | 49 +++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 4e8fd407c9..8572b422e5 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -50,6 +50,7 @@ __all__ = [ 'sequence_last_step', 'dropout', 'split', + 'greedy_ctc_evaluator', ] @@ -1547,13 +1548,13 @@ def split(input, num_or_sections, dim=-1): Args: input (Variable): The input variable which is a Tensor or LoDTensor. 
- num_or_sections (int|list): If :attr:`num_or_sections` is an integer, - then the integer indicates the number of equal sized sub-tensors - that the tensor will be divided into. If :attr:`num_or_sections` - is a list of integers, the length of list indicates the number of - sub-tensors and the integers indicate the sizes of sub-tensors' + num_or_sections (int|list): If :attr:`num_or_sections` is an integer, + then the integer indicates the number of equal sized sub-tensors + that the tensor will be divided into. If :attr:`num_or_sections` + is a list of integers, the length of list indicates the number of + sub-tensors and the integers indicate the sizes of sub-tensors' :attr:`dim` dimension orderly. - dim (int): The dimension along which to split. If :math:`dim < 0`, the + dim (int): The dimension along which to split. If :math:`dim < 0`, the dimension to split along is :math:`rank(input) + dim`. Returns: @@ -1597,3 +1598,39 @@ def split(input, num_or_sections, dim=-1): 'axis': dim }) return outs + + +def greedy_ctc_evaluator(input, label, blank, normalized=False, name=None): + """ + """ + + helper = LayerHelper("greedy_ctc_evalutor", **locals()) + # top 1 op + topk_out = helper.create_tmp_variable(dtype=input.dtype) + topk_indices = helper.create_tmp_variable(dtype="int64") + helper.append_op( + type="top_k", + inputs={"X": [input]}, + outputs={"Out": [topk_out], + "Indices": [topk_indices]}, + attrs={"k": 1}) + + # ctc align op + ctc_out = helper.create_tmp_variable(dtype="int64") + helper.append_op( + type="ctc_align", + inputs={"Input": [topk_indices]}, + outputs={"Out": [ctc_out]}, + attrs={"merge_repeated": True, + "blank": blank}) + + # edit distance op + edit_distance_out = helper.create_tmp_variable(dtype="int64") + helper.append_op( + type="edit_distance", + inputs={"Hyps": [ctc_out], + "Refs": [label]}, + outputs={"Out": [edit_distance_out]}, + attrs={"normalized": normalized}) + + return edit_distance_out From a12db45c20abd00aeac5106d48a5f9260c7416cc Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 18 Jan 2018 13:41:36 +0800 Subject: [PATCH 22/54] rename libpaddle_fluid_shared.so to libpaddle_fluid.so --- paddle/inference/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt index af882f252b..33b638c48e 100644 --- a/paddle/inference/CMakeLists.txt +++ b/paddle/inference/CMakeLists.txt @@ -16,6 +16,8 @@ target_circle_link_libraries(paddle_fluid_shared ARCHIVE_END ${FLUID_CORE_MODULES}) +SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) + # install library & headers if(NOT WITH_C_API AND WITH_FLUID) install(FILES inference.h DESTINATION include/paddle/inference) From 082c302c3f1a2e289808829bcdd3db0a8eb5a853 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Thu, 18 Jan 2018 16:58:54 +0800 Subject: [PATCH 23/54] Add comments --- doc/api/v2/fluid/layers.rst | 5 +++++ python/paddle/v2/fluid/layers/nn.py | 31 ++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index 62c154e65d..1b40a495d6 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -493,3 +493,8 @@ swish ------ .. autofunction:: paddle.v2.fluid.layers.swish :noindex: + +greedy_ctc_error +------ +.. 
autofunction:: paddle.v2.fluid.layers.greedy_ctc_error + :noindex: diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 8572b422e5..c786f3128b 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -50,7 +50,7 @@ __all__ = [ 'sequence_last_step', 'dropout', 'split', - 'greedy_ctc_evaluator', + 'greedy_ctc_error', ] @@ -1600,11 +1600,32 @@ def split(input, num_or_sections, dim=-1): return outs -def greedy_ctc_evaluator(input, label, blank, normalized=False, name=None): - """ +def greedy_ctc_error(input, label, blank, normalized=False, name=None): """ + This evaluator is to calculate sequence-to-sequence edit distance. + + Args: + + input(Variable): (LodTensor, default: LoDTensor), the unscaled probabilities of variable-length sequences, which is a 2-D Tensor with LoD information. It's shape is [Lp, num_classes + 1], where Lp is the sum of all input sequences' length and num_classes is the true number of classes. (not including the blank label). + + label(Variable): (LodTensor, default: LoDTensor), the ground truth of variable-length sequence, which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1], where Lg is th sum of all labels' length. + + blank(int): the blank label index of Connectionist Temporal Classification (CTC) loss, which is in thehalf-opened interval [0, num_classes + 1). + + normalized(bool): Indicated whether to normalize the edit distance by the length of reference string. - helper = LayerHelper("greedy_ctc_evalutor", **locals()) + Returns: + Variable: sequence-to-sequence edit distance loss in shape [batch_size, 1]. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[8], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + + cost = fluid.layers.greedy_ctc_error(input=x,label=y, blank=0) + """ + helper = LayerHelper("greedy_ctc_error", **locals()) # top 1 op topk_out = helper.create_tmp_variable(dtype=input.dtype) topk_indices = helper.create_tmp_variable(dtype="int64") @@ -1620,7 +1641,7 @@ def greedy_ctc_evaluator(input, label, blank, normalized=False, name=None): helper.append_op( type="ctc_align", inputs={"Input": [topk_indices]}, - outputs={"Out": [ctc_out]}, + outputs={"Output": [ctc_out]}, attrs={"merge_repeated": True, "blank": blank}) From b83ff4514b2bbdce5ec2ba76f8205307c874c574 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 18 Jan 2018 17:52:26 +0800 Subject: [PATCH 24/54] Register reduce_op_kernel with multiple data types --- paddle/operators/reduce_op.cc | 14 +++++++++++++- paddle/operators/reduce_op.cu | 14 +++++++++++++- python/paddle/v2/fluid/tests/test_reduce_op.py | 14 +++++++------- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc index 09b7091358..4a06babeda 100644 --- a/paddle/operators/reduce_op.cc +++ b/paddle/operators/reduce_op.cc @@ -190,10 +190,22 @@ REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, #define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ REGISTER_OP_CPU_KERNEL(reduce_type, \ ops::ReduceKernel); \ + float, ops::functor>, \ + ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel); \ REGISTER_OP_CPU_KERNEL( \ reduce_type##_grad, \ ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel); FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/operators/reduce_op.cu 
b/paddle/operators/reduce_op.cu index 1dd948ed8a..4ed1e051db 100644 --- a/paddle/operators/reduce_op.cu +++ b/paddle/operators/reduce_op.cu @@ -20,10 +20,22 @@ namespace ops = paddle::operators; #define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ REGISTER_OP_CUDA_KERNEL( \ reduce_type, ops::ReduceKernel); \ + float, ops::functor>, \ + ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel); \ REGISTER_OP_CUDA_KERNEL( \ reduce_type##_grad, \ ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel); FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/python/paddle/v2/fluid/tests/test_reduce_op.py b/python/paddle/v2/fluid/tests/test_reduce_op.py index 57ee307ba6..e2df0395ea 100644 --- a/python/paddle/v2/fluid/tests/test_reduce_op.py +++ b/python/paddle/v2/fluid/tests/test_reduce_op.py @@ -19,7 +19,7 @@ from op_test import OpTest class TestSumOp(OpTest): def setUp(self): self.op_type = "reduce_sum" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} self.outputs = {'Out': self.inputs['X'].sum(axis=0)} def test_check_output(self): @@ -32,7 +32,7 @@ class TestSumOp(OpTest): class TestMeanOp(OpTest): def setUp(self): self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} self.attrs = {'dim': 1} self.outputs = {'Out': self.inputs['X'].mean(axis=self.attrs['dim'])} @@ -48,7 +48,7 @@ class TestMaxOp(OpTest): def setUp(self): self.op_type = "reduce_max" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} self.attrs = {'dim': -1} self.outputs = {'Out': self.inputs['X'].max(axis=self.attrs['dim'])} @@ -61,7 +61,7 @@ class TestMinOp(OpTest): def setUp(self): self.op_type = "reduce_min" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} self.attrs = {'dim': 2} self.outputs = {'Out': self.inputs['X'].min(axis=self.attrs['dim'])} @@ -72,7 +72,7 @@ class TestMinOp(OpTest): class TestKeepDimReduce(OpTest): def setUp(self): self.op_type = "reduce_sum" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} self.attrs = {'dim': -2, 'keep_dim': True} self.outputs = { 'Out': self.inputs['X'].sum(axis=self.attrs['dim'], keepdims=True) @@ -88,7 +88,7 @@ class TestKeepDimReduce(OpTest): class Test1DReduce(OpTest): def setUp(self): self.op_type = "reduce_sum" - self.inputs = {'X': np.random.random(20).astype("float32")} + self.inputs = {'X': np.random.random(20).astype("float64")} self.outputs = {'Out': self.inputs['X'].sum(axis=0)} def test_check_output(self): @@ -101,7 +101,7 @@ class Test1DReduce(OpTest): class TestReduceAll(OpTest): def setUp(self): self.op_type = "reduce_sum" - self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} self.attrs = {'reduce_all': True} self.outputs = {'Out': self.inputs['X'].sum()} From 4673a4a9aa2c4c3d2cf487cacc841d59e817dfac Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Thu, 18 Jan 2018 20:40:47 +0800 Subject: [PATCH 25/54] divide this operator into ctc_greedy_decoder and edit_distance_error. 
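For reference, the decoding path that ctc_greedy_decoder composes out of the top_k (k=1) and ctc_align operators can be sketched in plain NumPy as below. This is an illustrative sketch only, not code from this patch; the helper name ctc_greedy_decode is made up here. It takes the per-step argmax, merges consecutive repeats, and drops the blank label, which reproduces the example given in the docstring further down (decoded sequences [2, 1] and [3]).

import numpy as np

def ctc_greedy_decode(probs, lod, blank=0):
    # probs: [sum of sequence lengths, num_classes + 1] step-wise probabilities.
    # lod:   offsets of each sequence inside probs, e.g. [0, 4, 8].
    results = []
    for start, end in zip(lod[:-1], lod[1:]):
        tokens = np.argmax(probs[start:end], axis=1)  # best label per time step
        decoded, prev = [], None
        for t in tokens:
            if t != prev and t != blank:  # collapse consecutive repeats, skip blanks
                decoded.append(int(t))
            prev = t
        results.append(decoded)
    return results

probs = np.array([[0.6, 0.1, 0.3, 0.1], [0.3, 0.2, 0.4, 0.1],
                  [0.1, 0.5, 0.1, 0.3], [0.5, 0.1, 0.3, 0.1],
                  [0.5, 0.1, 0.3, 0.1], [0.2, 0.2, 0.2, 0.4],
                  [0.2, 0.2, 0.1, 0.5], [0.5, 0.1, 0.3, 0.1]])
print(ctc_greedy_decode(probs, [0, 4, 8], blank=0))  # [[2, 1], [3]]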
--- doc/api/v2/fluid/layers.rst | 9 ++- python/paddle/v2/fluid/layers/nn.py | 99 +++++++++++++++++++++++------ 2 files changed, 85 insertions(+), 23 deletions(-) diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index aae63a9ad0..f1e4e753c5 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -500,9 +500,14 @@ swish .. autofunction:: paddle.v2.fluid.layers.swish :noindex: -greedy_ctc_error --------------- -.. autofunction:: paddle.v2.fluid.layers.greedy_ctc_error +edit_distance_error --------------- +.. autofunction:: paddle.v2.fluid.layers.edit_distance_error + :noindex: + +ctc_greedy_decoder +--------------- +.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder :noindex: l2_normalize diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 60f2fd8e9d..72246304be 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -50,7 +50,8 @@ __all__ = [ 'sequence_last_step', 'dropout', 'split', - 'greedy_ctc_error', + 'ctc_greedy_decoder', + 'edit_distance_error', 'l2_normalize', 'matmul', ] @@ -1791,17 +1792,21 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): return out -def greedy_ctc_error(input, label, blank, normalized=False, name=None): +def edit_distance_error(input, label, normalized=False, name=None): """ - This evaluator is to calculate sequence-to-sequence edit distance. + EditDistance operator computes the edit distances between a batch of hypothesis strings and their references. Edit distance, also called Levenshtein distance, measures how dissimilar two strings are by counting the minimum number of operations needed to transform one string into another. Here the operations include insertion, deletion, and substitution. For example, given hypothesis string A = "kitten" and reference B = "sitting", the edit distance is 3, since transforming A into B takes at least two substitutions and one insertion: - Args: + "kitten" -> "sitten" -> "sittin" -> "sitting" - input(Variable): (LodTensor, default: LoDTensor), the unscaled probabilities of variable-length sequences, which is a 2-D Tensor with LoD information. It's shape is [Lp, num_classes + 1], where Lp is the sum of all input sequences' length and num_classes is the true number of classes. (not including the blank label). + Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total number denoted by `batch_size`, and the separation is specified by the LoD information. And the `batch_size` reference strings are arranged in order in the same way in the LoDTensor Input(Refs). - label(Variable): (LodTensor, default: LoDTensor), the ground truth of variable-length sequence, which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1], where Lg is th sum of all labels' length. + Output(Out) contains the `batch_size` results, and each one is the edit distance for a pair of strings. If Attr(normalized) is true, the edit distance will be divided by the length of the reference string. - blank(int): the blank label index of Connectionist Temporal Classification (CTC) loss, which is in thehalf-opened interval [0, num_classes + 1). + Args: + + input(Variable): The indices for hypothesis strings. + + label(Variable): The indices for reference strings. normalized(bool): Indicated whether to normalize the edit distance by the length of reference string. @@ -1812,11 +1817,73 @@ def greedy_ctc_error(input, label, blank, normalized=False, name=None): ..
code-block:: python x = fluid.layers.data(name='x', shape=[8], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y = fluid.layers.data(name='y', shape=[7], dtype='float32') + + cost = fluid.layers.edit_distance_error(input=x,label=y) + """ + helper = LayerHelper("edit_distance_error", **locals()) + + # edit distance op + edit_distance_out = helper.create_tmp_variable(dtype="int64") + helper.append_op( + type="edit_distance", + inputs={"Hyps": [input], + "Refs": [label]}, + outputs={"Out": [edit_distance_out]}, + attrs={"normalized": normalized}) + + return edit_distance_out + + +def ctc_greedy_decoder(input, blank, name=None): + """ + This op is used to decode sequences by greedy policy by below steps: + 1. Get the indexes of max value for each row in input. a.k.a. numpy.argmax(input, axis=0). + 2. For each sequence in result of step1, merge repeated tokens between two blanks and delete all blanks. + + A simple example as below: + + .. code-block:: text + + Given: + + input.data = [[0.6, 0.1, 0.3, 0.1], + [0.3, 0.2, 0.4, 0.1], + [0.1, 0.5, 0.1, 0.3], + [0.5, 0.1, 0.3, 0.1], + + [0.5, 0.1, 0.3, 0.1], + [0.2, 0.2, 0.2, 0.4], + [0.2, 0.2, 0.1, 0.5], + [0.5, 0.1, 0.3, 0.1]] + + input.lod = [[0, 4, 8]] + + Then: + + output.data = [[2], + [1], + [3]] + + output.lod = [[0, 2, 3]] + + Args: + + input(Variable): (LoDTensor), the probabilities of variable-length sequences, which is a 2-D Tensor with LoD information. It's shape is [Lp, num_classes + 1], where Lp is the sum of all input sequences' length and num_classes is the true number of classes. (not including the blank label). + + blank(int): the blank label index of Connectionist Temporal Classification (CTC) loss, which is in thehalf-opened interval [0, num_classes + 1). + + Returns: + Variable: CTC greedy decode result. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[8], dtype='float32') - cost = fluid.layers.greedy_ctc_error(input=x,label=y, blank=0) + cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0) """ - helper = LayerHelper("greedy_ctc_error", **locals()) + helper = LayerHelper("ctc_greedy_decoder", **locals()) # top 1 op topk_out = helper.create_tmp_variable(dtype=input.dtype) topk_indices = helper.create_tmp_variable(dtype="int64") @@ -1835,14 +1902,4 @@ def greedy_ctc_error(input, label, blank, normalized=False, name=None): outputs={"Output": [ctc_out]}, attrs={"merge_repeated": True, "blank": blank}) - - # edit distance op - edit_distance_out = helper.create_tmp_variable(dtype="int64") - helper.append_op( - type="edit_distance", - inputs={"Hyps": [ctc_out], - "Refs": [label]}, - outputs={"Out": [edit_distance_out]}, - attrs={"normalized": normalized}) - - return edit_distance_out + return ctc_out From 5846aab31730cb595f6210bed0758954529fc0f0 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Fri, 19 Jan 2018 14:53:46 +0800 Subject: [PATCH 26/54] 1. Rename 'edit_distance_error' to 'edit_distance' 2. Add edit distance evaluator to evaluator.py --- doc/api/v2/fluid/layers.rst | 2 +- python/paddle/v2/fluid/evaluator.py | 32 +++++++++++++++++++++++++++++ python/paddle/v2/fluid/layers/nn.py | 9 ++++---- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index f1e4e753c5..2ae68d01d3 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -500,7 +500,7 @@ swish .. autofunction:: paddle.v2.fluid.layers.swish :noindex: -edit_distance_error +edit_distance --------------- .. 
autofunction:: paddle.v2.fluid.layers.edit_distance_error :noindex: diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py index adf174a07d..336d25929e 100644 --- a/python/paddle/v2/fluid/evaluator.py +++ b/python/paddle/v2/fluid/evaluator.py @@ -204,3 +204,35 @@ class ChunkEvaluator(Evaluator): [precision], dtype='float32'), np.array( [recall], dtype='float32'), np.array( [f1_score], dtype='float32') + + +class EditDistance(Evaluator): + """ + Average edit distance error for multiple mini-batches. + """ + + def __init__(self, input, label, k=1, **kwargs): + super(EditDistance, self).__init__("edit_distance", **kwargs) + main_program = self.helper.main_program + if main_program.current_block().idx != 0: + raise ValueError("You can only invoke Evaluator in root block") + + self.total_error = self.create_state( + dtype='int64', shape=[1], suffix='total') + self.batch_num = 0 + error = layers.edit_distance(input=input, label=label) + mean_error = layers.mean(input=error) + layers.sums(input=[self.total_error, mean_error], out=self.total_error) + self.metrics.append(mean_error) + + def eval(self, executor, eval_program=None): + self.batch_num += 1 + if eval_program is None: + eval_program = Program() + block = eval_program.current_block() + with program_guard(main_program=eval_program): + total_error = _clone_var_(block, self.total_error) + batch_num = layers.fill_constant( + shape=[1], value=self.batch_num, dtype="float32") + out = layers.elementwise_div(x=total_error, y=batch_num) + return np.array(executor.run(eval_program, fetch_list=[out])[0]) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 0c77b89065..8383e43dea 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -28,8 +28,7 @@ __all__ = [ 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand', 'lstm_unit', 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'sequence_first_step', 'sequence_last_step', 'dropout', 'split', - 'ctc_greedy_decoder', 'edit_distance_error', 'l2_normalize', 'matmul', - 'warpctc' + 'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'warpctc' ] @@ -1768,7 +1767,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): return out -def edit_distance_error(input, label, normalized=False, name=None): +def edit_distance(input, label, normalized=False, name=None): """ EditDistance operator computes the edit distances between a batch of hypothesis strings and their references.Edit distance, also called Levenshtein distance, measures how dissimilar two strings are by counting the minimum number of operations to transform one string into anthor. Here the operations include insertion, deletion, and substitution. 
For example, given hypothesis string A = "kitten" and reference B = "sitting", the edit distance is 3 for A will be transformed into B at least after two substitutions and one insertion: @@ -1795,9 +1794,9 @@ def edit_distance_error(input, label, normalized=False, name=None): x = fluid.layers.data(name='x', shape=[8], dtype='float32') y = fluid.layers.data(name='y', shape=[7], dtype='float32') - cost = fluid.layers.edit_distance_error(input=x,label=y) + cost = fluid.layers.edit_distance(input=x,label=y) """ - helper = LayerHelper("edit_distance_error", **locals()) + helper = LayerHelper("edit_distance", **locals()) # edit distance op edit_distance_out = helper.create_tmp_variable(dtype="int64") From bf33b191d0cbb950d50f003f08ed3f16f0e2b92e Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 18 Jan 2018 18:41:08 +0800 Subject: [PATCH 27/54] Add bipartite matching operator and unit testing. --- paddle/operators/bipartite_match_op.cc | 178 ++++++++++++++++++ .../v2/fluid/tests/test_bipartite_match_op.py | 100 ++++++++++ 2 files changed, 278 insertions(+) create mode 100644 paddle/operators/bipartite_match_op.cc create mode 100644 python/paddle/v2/fluid/tests/test_bipartite_match_op.py diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc new file mode 100644 index 0000000000..8dbade65a5 --- /dev/null +++ b/paddle/operators/bipartite_match_op.cc @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class BipartiteMatchOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("DisMat"), + "Input(DisMat) of BipartiteMatch should not be null."); + + auto dims = ctx->GetInputDim("DisMat"); + PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DisMat) must be 2."); + + ctx->SetOutputDim("ColToRowMatchIndices", dims); + ctx->SetOutputDim("ColToRowMatchDis", dims); + } +}; + +template +class BipartiteMatchKernel : public framework::OpKernel { + public: + // The match_indices must be initialized to -1 at first. + // The match_dis must be initialized to 0 at first. 
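+  // Greedy strategy: in each pass, every still-unmatched column is scanned
+  // against the rows remaining in row_pool, and the (row, column) pair with
+  // the largest distance is taken; pairs whose distance is below 1e-6 are
+  // treated as non-overlapping and skipped. The chosen row is then removed
+  // from row_pool, so each row is matched at most once, and the loop stops
+  // once no further pair can be matched.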
+ void BipartiteMatch(const Tensor& dis, int* match_indices, + T* match_dis) const { + int64_t row = dis.dims()[0]; + int64_t col = dis.dims()[1]; + auto* dis_data = dis.data(); + std::vector row_pool; + for (int i = 0; i < row; ++i) { + row_pool.push_back(i); + } + while (row_pool.size() > 0) { + int max_idx = -1; + int max_row_idx = -1; + T max_dis = -1; + for (int64_t j = 0; j < col; ++j) { + if (match_indices[j] != -1) { + continue; + } + for (int k = 0; k < row_pool.size(); ++k) { + int m = row_pool[k]; + // distance is 0 between m-th row and j-th column + if (dis_data[m * col + j] < 1e-6) { + continue; + } + if (dis_data[m * col + j] > max_dis) { + max_idx = j; + max_row_idx = m; + max_dis = dis_data[m * col + j]; + } + } + } + if (max_idx == -1) { + // Cannot find good match. + break; + } else { + PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); + match_indices[max_idx] = max_row_idx; + match_dis[max_idx] = max_dis; + // Erase the row index. + row_pool.erase( + std::find(row_pool.begin(), row_pool.end(), max_row_idx)); + } + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* dis_mat = context.Input("DisMat"); + auto* match_indices = context.Output("ColToRowMatchIndices"); + auto* match_dis = context.Output("ColToRowMatchDis"); + + auto& dev_ctx = context.device_context(); + + auto col = dis_mat->dims()[1]; + + int64_t n = dis_mat->lod().size() == 0 + ? 1 + : static_cast(dis_mat->lod().back().size() - 1); + match_indices->mutable_data({n, col}, context.GetPlace()); + match_dis->mutable_data({n, col}, context.GetPlace()); + + math::SetConstant iset; + iset(dev_ctx, match_indices, static_cast(-1)); + math::SetConstant tset; + tset(dev_ctx, match_dis, static_cast(0)); + + int* indices = match_indices->data(); + T* dis = match_dis->data(); + if (n == 1) { + BipartiteMatch(*dis_mat, indices, dis); + } else { + auto lod = dis_mat->lod().back(); + for (size_t i = 0; i < lod.size() - 1; ++i) { + Tensor one_ins = dis_mat->Slice(lod[i], lod[i + 1]); + BipartiteMatch(one_ins, indices + i * col, dis + i * col); + } + } + } +}; + +class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BipartiteMatchOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "DisMat", + "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " + "[K, M]. It is pair-wise distance matrix between the entities " + "represented by each row and each column. For example, assumed one " + "entity is A with shape [K], another entity is B with shape [M]. The " + "DisMat[i][j] is the distance between A[i] and B[j]. The bigger " + "the distance is, the more similar the pairs are. Please note, " + "This tensor can contain LoD information to represent a batch of " + "inputs. One instance of this batch can contain different numbers of " + "entities."); + AddOutput("ColToRowMatchIndices", + "(Tensor) A 2-D Tensor with shape [N, M] in int type. " + "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it " + "means B[j] does not match any entity in i-th instance. " + "Otherwise, it means B[j] is matched to row " + "RowToColMatchIndices[i][j] in i-th instance. The row number of " + "i-th instance is saved in RowToColMatchIndices[i][j]."); + AddOutput("ColToRowMatchDis", + "(Tensor) A 2-D Tensor with shape [N, M] in float type. " + "N is batch size. If ColToRowMatchIndices[i][j] is -1, " + "ColToRowMatchDis[i][j] is also -1.0. 
Otherwise, assumed " + "RowToColMatchIndices[i][j] = d, and the row offsets of each " + "instance are called LoD. Then " + "ColToRowMatchDis[i][j] = DisMat[d+LoD[i]][j]"); + AddComment(R"DOC( +This operator is a greedy bipartite matching algorithm, which is used to +obtain the matching with the (greedy) maximum distance based on the input +distance matrix. There are two outputs to save matched indices and distance. +And this operator only calculate matched indices from column to row. +A simple description, this algothrim matched the best (maximum distance) +row entity to the column entity and the matched indices are not duplicated +in each row of ColToRowMatchIndices. If the column entity is not matched +any row entity, set -1 in ColToRowMatchIndices. + +Please note that the input DisMat can be LoDTensor (with LoD) or Tensor. +If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. +If Tensor, the height of ColToRowMatchIndices is 1. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(bipartite_match, ops::BipartiteMatchOp, + ops::BipartiteMatchOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(bipartite_match, ops::BipartiteMatchKernel, + ops::BipartiteMatchKernel); diff --git a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py new file mode 100644 index 0000000000..8f1db35d3c --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py @@ -0,0 +1,100 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest +import numpy as np +from op_test import OpTest + + +def bipartite_match(distance, match_indices, match_dis): + """Bipartite Matching algorithm. + Arg: + distance (numpy.array) : The distance of two entries with shape [M, N]. + match_indices (numpy.array): the matched indices from column to row + with shape [1, N], it must be initialized to -1. + match_dis (numpy.array): The matched distance from column to row + with shape [1, N], it must be initialized to 0. + """ + match_pair = [] + row, col = distance.shape + for i in range(row): + for j in range(col): + match_pair.append((i, j, distance[i][j])) + + match_sorted = sorted(match_pair, key=lambda tup: tup[2], reverse=True) + + row_indices = -1 * np.ones((row, ), dtype=np.int) + + idx = 0 + for i, j, dis in match_sorted: + if idx >= row: + break + if match_indices[j] == -1 and row_indices[i] == -1 and dis > 0: + match_indices[j] = i + row_indices[i] = j + match_dis[j] = dis + idx += 1 + + +def batch_bipartite_match(distance, lod): + """Bipartite Matching algorithm for batch input. + Arg: + distance (numpy.array) : The distance of two entries with shape [M, N]. + lod (list of int): The offsets of each input in this batch. 
+ """ + n = len(lod) - 1 + m = distance.shape[1] + match_indices = -1 * np.ones((n, m), dtype=np.int) + match_dis = np.zeros((n, m), dtype=np.float32) + for i in range(len(lod) - 1): + bipartite_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :], + match_dis[i, :]) + return match_indices, match_dis + + +class TestBipartiteMatchOpForWithLoD(OpTest): + def setUp(self): + self.op_type = 'bipartite_match' + lod = [[0, 5, 11, 23]] + dis = np.random.random((23, 217)).astype('float32') + match_indices, match_dis = batch_bipartite_match(dis, lod[0]) + + self.inputs = {'DisMat': (dis, lod)} + self.outputs = { + 'ColToRowMatchIndices': (match_indices), + 'ColToRowMatchDis': (match_dis), + } + + def test_check_output(self): + self.check_output() + + +class TestBipartiteMatchOpWithoutLoD(OpTest): + def setUp(self): + self.op_type = 'bipartite_match' + lod = [[0, 8]] + dis = np.random.random((8, 17)).astype('float32') + match_indices, match_dis = batch_bipartite_match(dis, lod[0]) + + self.inputs = {'DisMat': dis} + self.outputs = { + 'ColToRowMatchIndices': (match_indices), + 'ColToRowMatchDis': (match_dis), + } + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() From a8f118ca839a03d84aead834759679948e41f6f5 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Sat, 20 Jan 2018 09:57:34 +0800 Subject: [PATCH 28/54] Add EditDistance to evaluator.py --- python/paddle/v2/fluid/evaluator.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py index 336d25929e..351db4f12d 100644 --- a/python/paddle/v2/fluid/evaluator.py +++ b/python/paddle/v2/fluid/evaluator.py @@ -218,21 +218,23 @@ class EditDistance(Evaluator): raise ValueError("You can only invoke Evaluator in root block") self.total_error = self.create_state( - dtype='int64', shape=[1], suffix='total') - self.batch_num = 0 + dtype='float32', shape=[1], suffix='total') + self.batch_num = self.create_state( + dtype='float32', shape=[1], suffix='total') error = layers.edit_distance(input=input, label=label) - mean_error = layers.mean(input=error) + error = layers.cast(x=error, dtype='float32') + mean_error = layers.mean(x=error) layers.sums(input=[self.total_error, mean_error], out=self.total_error) + const1 = layers.fill_constant(shape=[1], value=1.0, dtype="float32") + layers.sums(input=[self.batch_num, const1], out=self.batch_num) self.metrics.append(mean_error) def eval(self, executor, eval_program=None): - self.batch_num += 1 if eval_program is None: eval_program = Program() block = eval_program.current_block() with program_guard(main_program=eval_program): total_error = _clone_var_(block, self.total_error) - batch_num = layers.fill_constant( - shape=[1], value=self.batch_num, dtype="float32") + batch_num = _clone_var_(block, self.batch_num) out = layers.elementwise_div(x=total_error, y=batch_num) return np.array(executor.run(eval_program, fetch_list=[out])[0]) From 0b854bdb8b0aad6360cf2c15b1ca40b52a94d40c Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 22 Jan 2018 09:37:23 +0800 Subject: [PATCH 29/54] Add sequence_erase option into edit distance python API --- python/paddle/v2/fluid/layers/nn.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 5d05046bba..c57811df1d 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -1864,7 
+1864,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): return out -def edit_distance(input, label, normalized=False, name=None): +def edit_distance(input, label, normalized=False, tokens=None, name=None): """ EditDistance operator computes the edit distances between a batch of hypothesis strings and their references.Edit distance, also called Levenshtein distance, measures how dissimilar two strings are by counting the minimum number of operations to transform one string into anthor. Here the operations include insertion, deletion, and substitution. For example, given hypothesis string A = "kitten" and reference B = "sitting", the edit distance is 3 for A will be transformed into B at least after two substitutions and one insertion: @@ -1882,6 +1882,8 @@ def edit_distance(input, label, normalized=False, name=None): normalized(bool): Indicated whether to normalize the edit distance by the length of reference string. + tokens(list): Tokens that should be removed before calculating edit distance. + Returns: Variable: sequence-to-sequence edit distance loss in shape [batch_size, 1]. @@ -1895,6 +1897,25 @@ def edit_distance(input, label, normalized=False, name=None): """ helper = LayerHelper("edit_distance", **locals()) + # remove some tokens from input and labels + if tokens is not None and len(tokens) > 0: + erased_input = helper.create_tmp_variable(dtype="int64") + erased_label = helper.create_tmp_variable(dtype="int64") + + helper.append_op( + type="sequence_erase", + inputs={"X": [input]}, + outputs={"Out": [erased_input]}, + attrs={"tokens": tokens}) + input = erased_input + + helper.append_op( + type="sequence_erase", + inputs={"X": [label]}, + outputs={"Out": [erased_label]}, + attrs={"tokens": tokens}) + label = erased_label + # edit distance op edit_distance_out = helper.create_tmp_variable(dtype="int64") helper.append_op( From 500e29a4a4a8d6e70f79cc109f5f43709a4ad605 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 22 Jan 2018 13:11:45 +0800 Subject: [PATCH 30/54] 1. Reduce attributes 2. Rename 'get_output_size' to 'OutputSize' 3. Remove redundant whitespace char.
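As a quick, hedged illustration (not part of the patch) of what the renamed OutputSize() helper computes, the Python sketch below mirrors its formula; the function name output_size is ours. Checked against the example dimensions quoted in the operator comment (x.dims = {2, 2, 3, 3}, kernels = [2, 2], strides = [1, 1], paddings = [0, 0, 0, 0]), each image yields a 2 x 2 grid of windows, so the operator's output shape is [2 * 2 * 2, 2 * 2 * 2] = [8, 8].

def output_size(input_size, filter_size, padding_0, padding_1, stride):
    # Mirrors OutputSize() from im2sequence_op.h:
    # (input_size + padding_0 + padding_1 - filter_size) / stride + 1
    return (input_size + padding_0 + padding_1 - filter_size) // stride + 1

# Example from the operator comment: H = W = 3, kernel 2x2, stride 1, no padding.
out_h = output_size(3, 2, 0, 0, 1)  # 2
out_w = output_size(3, 2, 0, 0, 1)  # 2
print(out_h, out_w)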
--- paddle/operators/im2sequence_op.cc | 59 ++++++++++++++---------------- paddle/operators/im2sequence_op.h | 56 +++++++++++----------------- 2 files changed, 49 insertions(+), 66 deletions(-) diff --git a/paddle/operators/im2sequence_op.cc b/paddle/operators/im2sequence_op.cc index 9b2397bdc8..9c9802c043 100644 --- a/paddle/operators/im2sequence_op.cc +++ b/paddle/operators/im2sequence_op.cc @@ -30,28 +30,24 @@ class Im2SequenceOp : public framework::OperatorWithKernel { auto in_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(in_dim.size(), 4, - "Input(X) format must be 4D tensor, eg., NCHW."); + "Input(X) format must be 4D tensor, eg., NCHW."); - int block_height = ctx->Attrs().Get("block_height"); - int block_width = ctx->Attrs().Get("block_width"); - int stride_height = ctx->Attrs().Get("stride_height"); - int stride_width = ctx->Attrs().Get("stride_width"); - int padding_height = ctx->Attrs().Get("padding_height"); - int padding_width = ctx->Attrs().Get("padding_width"); + auto kernels = ctx->Attrs().Get>("kernels"); + auto strides = ctx->Attrs().Get>("strides"); + auto paddings = ctx->Attrs().Get>("paddings"); int batch_size = in_dim[0]; int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - int output_height = get_output_size(img_height, block_height, stride_height, - padding_height); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); int output_width = - get_output_size(img_width, block_width, stride_width, padding_width); + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); ctx->SetOutputDim("Out", {batch_size * output_height * output_width, - img_channels * block_height * block_width}); - // TODO(wanghaoshuang): cal lod in complie time + img_channels * kernels[0] * kernels[1]}); } }; @@ -66,26 +62,30 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { "H: height" "W: width"); AddOutput("Out", "(LodTensor)The output data of im2sequence op,"); - AddAttr("block_height", "(int)height of block."); - AddAttr("block_width", "(int)width of block."); - AddAttr("stride_height", "(int)height of stride."); - AddAttr("stride_width", "(int)width of stride."); - AddAttr("padding_height", "(int)height of padding."); - AddAttr("padding_width", "(int)width of padding."); + AddAttr>("kernels", + "(vector), the " + "kernels(kernel_height, kernel_width)") + AddAttr>("strides", + "(vector default:{1, 1}), the " + "strides(h_stride, w_stride)") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0, 0, 0}), the " + "paddings(up_pad, left_pad, down_pad, right_pad)") + .SetDefault({0, 0, 0, 0}); AddComment(R"DOC( -Convert feature map to minibatch matrix. -- matirx height is: output_height * output_width -- matrix width is: block_height * block_width * channels +This op uses kernels to scan images and converts these images to sequences. 
+After expanding, the number of time steps is output_height * output_width +and the dimension of each time step is kernel_height * kernel_width * channels, +in which: output_height = - 1 + (2 * padding_height + img_height - block_height + stride_height - 1) / + 1 + (padding_up + padding_down + img_height - kernel_height + stride_height - 1) / stride_height; output_width = - 1 + (2 * padding_width + img_width - block_width + stride_width - 1) / + 1 + (padding_left + padding_right + img_width - kernel_width + stride_width - 1) / stride_width; -After expanding, The number of time steps are output_height * output_width -and the dimension of each time step is block_height * block_width * channels. This op can be used after convolution neural network, and before recurrent neural network. Given: @@ -109,12 +109,9 @@ x.dims = {2, 2, 3, 3} And: -block_height = 2 -block_width = 2 -stride_height = 1 -stride_width = 1 -padding_height = 0 -padding_width = 0 +kernels = [2, 2] +strides = [1, 1] +paddings = [0, 0, 0, 0] Then: diff --git a/paddle/operators/im2sequence_op.h b/paddle/operators/im2sequence_op.h index 85d6cac444..352d290b1b 100644 --- a/paddle/operators/im2sequence_op.h +++ b/paddle/operators/im2sequence_op.h @@ -26,9 +26,11 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -inline int get_output_size(int img_size, int block_size, int stride, - int padding) { - return (1 + (img_size + 2 * padding - block_size + stride - 1) / stride); +inline int OutputSize(int input_size, int filter_size, int padding_0, + int padding_1, int stride) { + const int output_size = + (input_size + padding_0 + padding_1 - filter_size) / stride + 1; + return output_size; } template @@ -47,32 +49,24 @@ class Im2SequenceKernel : public framework::OpKernel { int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - int block_height = ctx.Attr("block_height"); - int block_width = ctx.Attr("block_width"); - int stride_height = ctx.Attr("stride_height"); - int stride_width = ctx.Attr("stride_width"); - int padding_height = ctx.Attr("padding_height"); - int padding_width = ctx.Attr("padding_width"); - - int output_height = get_output_size(img_height, block_height, stride_height, - padding_height); + + auto kernels = ctx->Attrs().Get>("kernels"); + auto strides = ctx->Attrs().Get>("strides"); + auto paddings = ctx->Attrs().Get>("paddings"); + int output_height = + OutputSize(img_height, kernels[0], paddings[0], paddings[2], strides[0]); int output_width = - get_output_size(img_width, block_width, stride_width, padding_width); + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); const std::vector dilations({1, 1}); - const std::vector strides( - {stride_height, stride_width, stride_height, stride_width}); - const std::vector paddings( - {padding_height, padding_width, padding_height, padding_width}); auto out_dims = out->dims(); out->Resize({batch_size, out->numel() / batch_size}); for (int i = 0; i < batch_size; i++) { const Tensor src = in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize({output_height, output_width, - img_channels, block_height, - block_width}); + Tensor dst = out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, kernels[0], kernels[1]}); math::Im2ColFunctor f; auto& dev_ctx = ctx.template device_context(); @@ -112,22 +106,15 @@ class Im2SequenceGradKernel : public framework::OpKernel { int img_height = in_dim[2]; int img_width
= in_dim[3]; - int block_height = ctx.Attr("block_height"); - int block_width = ctx.Attr("block_width"); - int stride_height = ctx.Attr("stride_height"); - int stride_width = ctx.Attr("stride_width"); - int padding_height = ctx.Attr("padding_height"); - int padding_width = ctx.Attr("padding_width"); - int output_height = get_output_size(img_height, block_height, stride_height, - padding_height); + auto kernels = ctx->Attrs().Get>("kernels"); + auto strides = ctx->Attrs().Get>("strides"); + auto paddings = ctx->Attrs().Get>("paddings"); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); int output_width = - get_output_size(img_width, block_width, stride_width, padding_width); + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); const std::vector dilations({1, 1}); - const std::vector strides( - {stride_height, stride_width, stride_height, stride_width}); - const std::vector paddings( - {padding_height, padding_width, padding_height, padding_width}); auto d_out_dims = d_out->dims(); d_out->Resize({batch_size, d_out->numel() / batch_size}); @@ -135,8 +122,7 @@ class Im2SequenceGradKernel : public framework::OpKernel { Tensor dst = d_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); const Tensor src = d_out->Slice(i, i + 1).Resize( - {output_height, output_width, img_channels, block_height, - block_width}); + {output_height, output_width, img_channels, kernels[0], kernels[1]}); math::Col2ImFunctor f; auto& dev_ctx = ctx.template device_context(); f(dev_ctx, src, dilations, strides, paddings, &dst); From f6cea357432acbd70f459ee35103d2c48f152f36 Mon Sep 17 00:00:00 2001 From: ying Date: Mon, 22 Jan 2018 14:12:55 +0800 Subject: [PATCH 31/54] fix rendering error of transpose operator. --- paddle/operators/transpose_op.cc | 51 ++++++++++++++------------------ 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc index 11615d806a..c7ae162638 100644 --- a/paddle/operators/transpose_op.cc +++ b/paddle/operators/transpose_op.cc @@ -59,44 +59,39 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", - "(Tensor)The input tensor, tensors with rank at most 6 are supported"); - AddOutput("Out", "(Tensor)The output tensor"); + "(Tensor) The input tensor, tensors with rank up to 6 are supported."); + AddOutput("Out", "(Tensor)The output tensor."); AddAttr>( "axis", - "(vector)A list of values, and the size of the list should be " - "the same with the input tensor rank, the tensor will " - "permute the axes according the the values given"); + "(vector) A list of values, and the size of the list should be " + "the same with the input tensor rank. This operator permutes the input " + "tensor's axes according to the values given."); AddComment(R"DOC( Transpose Operator. -The input tensor will be permuted according to the axis values given. -The op functions is similar to how numpy.transpose works in python. +The input tensor will be permuted according to the axes given. +The behavior of this operator is similar to how `numpy.transpose` works. -For example: +- suppose the input `X` is a 2-D tensor: + $$ + X = \begin{pmatrix} + 0 &1 &2 \\ + 3 &4 &5 + \end{pmatrix}$$ - .. 
code-block:: text + the given `axes` is: $[1, 0]$, and $Y$ = transpose($X$, axis) - input = numpy.arange(6).reshape((2,3)) + then the output $Y$ is: - the input is: + $$ + Y = \begin{pmatrix} + 0 &3 \\ + 1 &4 \\ + 2 &5 + \end{pmatrix}$$ - array([[0, 1, 2], - [3, 4, 5]]) - - given axis is: - - [1, 0] - - output = input.transpose(axis) - - then the output is: - - array([[0, 3], - [1, 4], - [2, 5]]) - -So, given a input tensor of shape(N, C, H, W) and the axis is {0, 2, 3, 1}, -the output tensor shape will be (N, H, W, C) +- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is +$[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$. )DOC"); } From 867001bd5e5119c173d1a44bce9347dca8ba40b4 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 22 Jan 2018 14:49:24 +0800 Subject: [PATCH 32/54] remove unused v1 api doc --- doc/api/v1/data_provider/dataprovider_cn.rst | 15 -- doc/api/v1/data_provider/dataprovider_en.rst | 34 --- .../v1/data_provider/pydataprovider2_cn.rst | 229 ---------------- .../v1/data_provider/pydataprovider2_en.rst | 249 ------------------ doc/api/v1/data_provider/src/mnist_config.py | 24 -- .../data_provider/src/mnist_provider.dict.py | 38 --- doc/api/v1/data_provider/src/mnist_train.txt | 3 - .../data_provider/src/sentimental_config.py | 28 -- .../data_provider/src/sentimental_provider.py | 57 ---- .../data_provider/src/sentimental_train.txt | 3 - doc/api/v1/data_provider/src/train.list | 1 - doc/api/v1/index_cn.rst | 37 --- doc/api/v1/index_en.rst | 37 --- doc/api/v1/predict/src/predict_sample.py | 135 ---------- doc/api/v1/predict/swig_py_paddle_cn.rst | 58 ---- doc/api/v1/predict/swig_py_paddle_en.rst | 59 ----- 16 files changed, 1007 deletions(-) delete mode 100644 doc/api/v1/data_provider/dataprovider_cn.rst delete mode 100644 doc/api/v1/data_provider/dataprovider_en.rst delete mode 100644 doc/api/v1/data_provider/pydataprovider2_cn.rst delete mode 100644 doc/api/v1/data_provider/pydataprovider2_en.rst delete mode 100644 doc/api/v1/data_provider/src/mnist_config.py delete mode 100644 doc/api/v1/data_provider/src/mnist_provider.dict.py delete mode 100644 doc/api/v1/data_provider/src/mnist_train.txt delete mode 100644 doc/api/v1/data_provider/src/sentimental_config.py delete mode 100644 doc/api/v1/data_provider/src/sentimental_provider.py delete mode 100644 doc/api/v1/data_provider/src/sentimental_train.txt delete mode 100644 doc/api/v1/data_provider/src/train.list delete mode 100644 doc/api/v1/index_cn.rst delete mode 100644 doc/api/v1/index_en.rst delete mode 100644 doc/api/v1/predict/src/predict_sample.py delete mode 100644 doc/api/v1/predict/swig_py_paddle_cn.rst delete mode 100644 doc/api/v1/predict/swig_py_paddle_en.rst diff --git a/doc/api/v1/data_provider/dataprovider_cn.rst b/doc/api/v1/data_provider/dataprovider_cn.rst deleted file mode 100644 index d08c6b3efa..0000000000 --- a/doc/api/v1/data_provider/dataprovider_cn.rst +++ /dev/null @@ -1,15 +0,0 @@ -.. 
_api_dataprovider: - -DataProvider的介绍 -================== - -DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存,让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 :ref:`api_pydataprovider2` ,来自定义传数据的过程。如果有更复杂的使用,或者需要更高的效率,用户也可以在C++端自定义一个 ``DataProvider`` 。 - -PaddlePaddle需要用户在网络配置(trainer_config.py)中定义使用哪种DataProvider,并且在DataProvider中实现如何访问训练文件列表(train.list)或测试文件列表(test.list)。 - -- train.list和test.list存放在本地(推荐直接存放到训练目录,以相对路径引用)。一般情况下,两者均为纯文本文件,其中每一行对应一个数据文件地址: - - - 如果数据文件存于本地磁盘,这个地址则为它的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)。 - - 地址也可以为hdfs文件路径,或者数据库连接路径等。 - - 由于这个地址会被DataProvider使用,因此,如何解析该地址也是用户自定义DataProvider时需要考虑的地方。 -- 如果没有设置test.list,或设置为None,那么在训练过程中不会执行测试操作;否则,会根据命令行参数指定的测试方式,在训练过程中进行测试,从而防止过拟合。 diff --git a/doc/api/v1/data_provider/dataprovider_en.rst b/doc/api/v1/data_provider/dataprovider_en.rst deleted file mode 100644 index 96efbb1da9..0000000000 --- a/doc/api/v1/data_provider/dataprovider_en.rst +++ /dev/null @@ -1,34 +0,0 @@ -Introduction -============== -DataProvider is a module that loads training or testing data into cpu or gpu -memory for the following triaining or testing process. - -For simple use, users can use Python :code:`PyDataProvider` to dynamically reads -the original data in any format or in any form, and then transfer them into a -data format PaddlePaddle requires. The process is extremly flexible and highly -customized, with sacrificing the efficiency only a little. This is extremly -useful when you have to dynamically generate certain kinds of data according to, -for example, the training performance. - -Besides, users also can customize a C++ :code:`DataProvider` for a more -complex usage, or for a higher efficiency. - -The following parameters are required to define in the PaddlePaddle network -configuration file (trainer_config.py): which DataProvider is chosen to used, -and specific parameters for DataProvider, including training file list -(train.list) and testing file list (test.list). - -Train.list and test.list are simply two plain text files, which defines path -of training or testing data. It is recommended that directly placing them into -the training directory, and reference to them by using a relative path ( -relative to the PaddePaddle program). - -Testing or evaluating will not be performed during training if the test.list is -not set or set to None. Otherwise, PaddlePaddle will evaluate the trained model -by the specified tesing data while training, every testing period (a user -defined command line parameter in PaddlePaddle) to prevent over-fitting. - -Each line of train.list and test.list is an absolute or relative path (relative -to the PaddePaddle program runtime) of data file. Fascinatingly more, each line -can also be a HDFS file path or a SQL connection string. As long as the user -assures how to access each file in DataProvider. diff --git a/doc/api/v1/data_provider/pydataprovider2_cn.rst b/doc/api/v1/data_provider/pydataprovider2_cn.rst deleted file mode 100644 index 8f9db31cfb..0000000000 --- a/doc/api/v1/data_provider/pydataprovider2_cn.rst +++ /dev/null @@ -1,229 +0,0 @@ -.. _api_pydataprovider2: - -PyDataProvider2的使用 -===================== - -PyDataProvider2是PaddlePaddle使用Python提供数据的推荐接口。该接口使用多线程读取数据,并提供了简单的Cache功能;同时可以使用户只关注如何从文件中读取每一条数据,而不用关心数据如何传输,如何存储等等。 - -.. contents:: - -MNIST的使用场景 ---------------- - -我们以MNIST手写识别为例,来说明PyDataProvider2的简单使用场景。 - -样例数据 -++++++++ - -MNIST是一个包含有70,000张灰度图片的数字分类数据集。样例数据 ``mnist_train.txt`` 如下: - -.. 
literalinclude:: src/mnist_train.txt - -其中每行数据代表一张图片,行内使用 ``;`` 分成两部分。第一部分是图片的标签,为0-9中的一个数字;第二部分是28*28的图片像素灰度值。 对应的 ``train.list`` 即为这个数据文件的名字: - -.. literalinclude:: src/train.list - -dataprovider的使用 -++++++++++++++++++ - -.. literalinclude:: src/mnist_provider.dict.py - -- 首先,引入PaddlePaddle的PyDataProvider2包。 -- 其次,定义一个Python的 `Decorator `_ `@provider`_ 。用于将下一行的数据输入函数标记成一个PyDataProvider2,同时设置它的input_types属性。 - - - `input_types`_:设置这个PyDataProvider2返回什么样的数据。本例根据网络配置中 ``data_layer`` 的名字,显式指定返回的是一个28*28维的稠密浮点数向量和一个[0-9]的10维整数标签。 - - .. literalinclude:: src/mnist_config.py - :lines: 9-10 - - - 注意:如果用户不显示指定返回数据的对应关系,那么PaddlePaddle会根据layer的声明顺序,来确定对应关系。但这个关系可能不正确,所以推荐使用显式指定的方式来设置input_types。 -- 最后,实现数据输入函数(如本例的 ``process`` 函数)。 - - - 该函数的功能是:打开文本文件,读取每一行,将行中的数据转换成与input_types一致的格式,然后返回给PaddlePaddle进程。注意, - - - 返回的顺序需要和input_types中定义的顺序一致。 - - 返回时,必须使用Python关键词 ``yield`` ,相关概念是 ``generator`` 。 - - 一次yield调用,返回一条完整的样本。如果想为一个数据文件返回多条样本,只需要在函数中调用多次yield即可(本例中使用for循环进行多次调用)。 - - - 该函数具有两个参数: - - - settings:在本例中没有使用,具体可以参考 `init_hook`_ 中的说明。 - - filename:为 ``train.list`` 或 ``test.list`` 中的一行,即若干数据文件路径的某一个。 - -网络配置中的调用 -++++++++++++++++ - -在网络配置里,只需要一行代码就可以调用这个PyDataProvider2,如, - -.. literalinclude:: src/mnist_config.py - :lines: 1-7 - -训练数据是 ``train.list`` ,没有测试数据,调用的PyDataProvider2是 ``mnist_provider`` 模块中的 ``process`` 函数。 - -小结 -+++++ - -至此,简单的PyDataProvider2样例就说明完毕了。对用户来说,仅需要知道如何从 **一个文件** 中读取 **一条样本** ,就可以将数据传送给PaddlePaddle了。而PaddlePaddle则会帮用户做以下工作: - -* 将数据组合成Batch进行训练 -* 对训练数据进行Shuffle -* 多线程的数据读取 -* 缓存训练数据到内存(可选) -* CPU->GPU双缓存 - -是不是很简单呢? - -时序模型的使用场景 ------------------- -样例数据 -++++++++ - -时序模型是指数据的某一维度是一个序列形式,即包含时间步信息。所谓时间步信息,不一定和时间有关系,只是说明数据的顺序是重要的。例如,文本信息就是一个序列数据。 - -本例采用英文情感分类的数据,即将一段英文文本数据,分类成正面情绪和负面情绪两类(用0和1表示)。样例数据 ``sentimental_train.txt`` 如下: - -.. literalinclude:: src/sentimental_train.txt - -dataprovider的使用 -++++++++++++++++++ - -相对MNIST而言,这个dataprovider较复杂,主要原因是增加了初始化机制 `init_hook`_。本例的 ``on_init`` 函数就是根据该机制配置的,它会在dataprovider创建的时候执行。 - -- 其中 ``input_types`` 和在 `@provider`_ 中配置的效果一致。本例中的输入特征是词ID的序列,因此使用 ``integer_value_sequence`` 类型来设置。 -- 将 ``dictionary`` 存入settings对象,在 ``process`` 函数中使用。 dictionary是从网络配置中传入的dict对象,即一个将单词字符串映射到单词ID的字典。 - -.. literalinclude:: src/sentimental_provider.py - -网络配置中的调用 -++++++++++++++++ - -调用这个PyDataProvider2的方法,基本上和MNIST样例一致,除了 - -* 在配置中需要读取外部字典。 -* 在声明DataProvider的时候传入dictionary作为参数。 - -.. 
literalinclude:: src/sentimental_config.py - :emphasize-lines: 12-14 - -参考(Reference) ---------------- - -@provider -+++++++++ - -``@provider`` 是一个Python的 `Decorator`_ ,可以将某一个函数标记成一个PyDataProvider2。如果不了解 `Decorator`_ 是什么也没关系,只需知道这是一个标记属性的方法就可以了。它包含的属性参数如下: - -* input_types:数据输入格式。具体的格式说明,请参考 `input_types`_ 。 -* should_shuffle:是不是要对数据做Shuffle。训练时默认shuffle,测试时默认不shuffle。 -* min_pool_size:设置内存中最小暂存的数据条数,也是PaddlePaddle所能够保证的shuffle粒度。如果为-1,则会预先读取全部数据到内存中。 -* pool_size: 设置内存中暂存的数据条数。如果为-1(默认),则不在乎内存暂存多少条数据。如果设置,则推荐大于训练时batch size的值,并且在内存足够的情况下越大越好。 -* can_over_batch_size:是否允许暂存略微多余pool_size的数据。由于这样做可以避免很多死锁问题,一般推荐设置成True。 -* calc_batch_size:可以传入一个函数,用于自定义每条数据的batch size(默认为1)。 -* cache: 数据缓存的策略,具体请参考 `cache`_ 。 -* init_hook:初始化时调用的函数,具体请参考 `init_hook`_ 。 -* check:如果为true,会根据input_types检查数据的合法性。 -* check_fail_continue:如果为true,那么当check出数据不合法时,会扔到这条数据,继续训练或预测。(对check=false的情况,没有作用) - -input_types -+++++++++++ - -PaddlePaddle的数据包括四种主要类型,和三种序列模式。 - -四种数据类型: - -* dense_vector:稠密的浮点数向量。 -* sparse_binary_vector:稀疏的01向量,即大部分值为0,但有值的地方必须为1。 -* sparse_float_vector:稀疏的向量,即大部分值为0,但有值的部分可以是任何浮点数。 -* integer:整数标签。 - -三种序列模式: - -* SequenceType.NO_SEQUENCE:不是一条序列 -* SequenceType.SEQUENCE:是一条时间序列 -* SequenceType.SUB_SEQUENCE: 是一条时间序列,且序列的每一个元素还是一个时间序列。 - -不同的数据类型和序列模式返回的格式不同,列表如下: - -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| | NO_SEQUENCE | SEQUENCE | SUB_SEQUENCE | -+======================+=====================+===================================+================================================+ -| dense_vector | [f, f, ...] | [[f, ...], [f, ...], ...] | [[[f, ...], ...], [[f, ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| sparse_binary_vector | [i, i, ...] | [[i, ...], [i, ...], ...] | [[[i, ...], ...], [[i, ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| sparse_float_vector | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| integer_value | i | [i, i, ...] | [[i, ...], [i, ...], ...] 
| -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ - -其中,f代表一个浮点数,i代表一个整数。 - -注意:对sparse_binary_vector和sparse_float_vector,PaddlePaddle存的是有值位置的索引。例如, - -- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ,类型是sparse_binary_vector,返回的是 ``[1, 2]`` 。 -- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ,类型是sparse_float_vector,返回的是 ``[(1, 0.5), (2, 0.7)]`` 。 - -init_hook -+++++++++ - -init_hook可以传入一个函数。该函数在初始化的时候会被调用,其参数如下: - -* 第一个参数是settings对象,它和数据传入函数的第一个参数(如本例中 ``process`` 函数的 ``settings`` 参数)必须一致。该对象具有以下两个属性: - * settings.input_types:数据输入格式,具体请参考 `input_types`_ 。 - * settings.logger:一个logging对象。 -* 其他参数使用 ``kwargs`` (key word arguments)传入,包括以下两种: - * PaddlePaddle定义的参数: 1)is_train:bool型参数,表示用于训练或预测;2)file_list:所有文件列表。 - * 用户定义的参数:使用args在网络配置中设置。 - -注意:PaddlePaddle保留添加参数的权力,因此init_hook尽量使用 ``**kwargs`` 来接受不使用的函数以保证兼容性。 - -cache -+++++ - -PyDataProvider2提供了两种简单的Cache策略: - -* CacheType.NO_CACHE:不缓存任何数据,每次都会从python端读取数据 -* CacheType.CACHE_PASS_IN_MEM:第一个pass会从python端读取数据,剩下的pass会直接从内存里 - 读取数据。 - - -注意事项 --------- - -可能的内存泄露问题 -++++++++++++++++++ - -PaddlePaddle将train.list中的每一行都传递给process函数,从而生成多个generator。当训练数据非常多时,就会生成非常多的generator。 - -虽然每个generator在没有调用的时候,是几乎不占内存的;但当调用过一次后,generator便会存下当前的上下文(Context),而这个Context可能会非常大。并且,generator至少需要调用两次才会知道是否停止。所以,即使process函数里面只有一个yield,也需要两次随机选择到相同generator的时候,才会释放该段内存。 - -.. code-block:: python - - def func(): - yield 0 - - f = func() # 创建generator - tmp = next(f) # 调用一次,返回0 - tmp = next(f) # 调用第二次的时候,才会Stop Iteration - -由于顺序调用这些generator不会出现上述问题,因此有两种解决方案: - -1. **最佳推荐**:将样本的地址放入另一个文本文件,train.list写入那个文本文件的地址。即不要将每一个样本都放入train.list。 -2. 在generator的上下文中尽量留下非常少的变量引用,例如 - -.. code-block:: python - - def real_process(fn): - # ... read from fn - return result # 当函数返回的时候,python可以解除掉内部变量的引用。 - - def process(fn): - yield real_process(fn) - -注意:这个问题是PyDataProvider读数据时候的逻辑问题,很难整体修正。 - -内存不够用的情况 -++++++++++++++++ - -PyDataProvider2会尽可能多的使用内存。因此,对于内存较小的机器,推荐使用 ``pool_size`` 变量来设置内存中暂存的数据条。具体请参考 `@provider`_ 中的说明。 - diff --git a/doc/api/v1/data_provider/pydataprovider2_en.rst b/doc/api/v1/data_provider/pydataprovider2_en.rst deleted file mode 100644 index e8fb629277..0000000000 --- a/doc/api/v1/data_provider/pydataprovider2_en.rst +++ /dev/null @@ -1,249 +0,0 @@ -.. _api_pydataprovider2: - -PyDataProvider2 -=============== - -We highly recommand users to use PyDataProvider2 to provide training or testing -data to PaddlePaddle. The user only needs to focus on how to read a single -sample from the original data file by using PyDataProvider2, leaving all of the -trivial work, including, transfering data into cpu/gpu memory, shuffle, binary -serialization to PyDataProvider2. PyDataProvider2 uses multithreading and a -fanscinating but simple cache strategy to optimize the efficiency of the data -providing process. - -DataProvider for the non-sequential model ------------------------------------------ - -Here we use the MNIST handwriting recognition data as an example to illustrate -how to write a simple PyDataProvider. - -MNIST is a handwriting classification data set. It contains 70,000 digital -grayscale images. Labels of the training sample range from 0 to 9. All the -images have been size-normalized and centered into images with the same size -of 28 x 28 pixels. - -A small part of the original data as an example is shown as below: - -.. literalinclude:: src/mnist_train.txt - -Each line of the data contains two parts, separated by :code:`;`. The first part is -label of an image. 
The second part contains 28x28 pixel float values. - -Just write path of the above data into train.list. It looks like this: - -.. literalinclude:: src/train.list - -The corresponding dataprovider is shown as below: - -.. literalinclude:: src/mnist_provider.dict.py - -The first line imports PyDataProvider2 package. -The main function is the process function, that has two parameters. -The first parameter is the settings, which is not used in this example. -The second parameter is the filename, that is exactly each line of train.list. -This parameter is passed to the process function by PaddlePaddle. - -:code:`@provider` is a Python -`Decorator `_ . -It sets some properties to DataProvider, and constructs a real PaddlePaddle -DataProvider from a very simple user implemented python function. It does not -matter if you are not familiar with `Decorator`_. You can keep it simple by -just taking :code:`@provider` as a fixed mark above the provider function you -implemented. - -`input_types`_ defines the data format that a DataProvider returns. -In this example, it is set to a 28x28-dimensional dense vector and an integer -scalar, whose value ranges from 0 to 9. -`input_types`_ can be set to several kinds of input formats, please refer to the -document of `input_types`_ for more details. - - -The process method is the core part to construct a real DataProvider in -PaddlePaddle. It implements how to open the text file, how to read one sample -from the original text file, convert them into `input_types`_, and give them -back to PaddlePaddle process at line 23. -Note that data yielded by the process function must follow the same order that -`input_types`_ are defined. - - -With the help of PyDataProvider2, user can focus on how to generate ONE traning -sample by using keywords :code:`yield`. -:code:`yield` is a python keyword, and a concept related to it includes -:code:`generator`. - -Only a few lines of codes need to be added into the training configuration file, -you can take this as an example. - -.. literalinclude:: src/mnist_config.py - -Here we specify training data by :code:`train.list`, and no testing data is specified. -The method which actually provide data is :code:`process`. - -User also can use another style to provide data, which defines the -:code:`data_layer`'s name explicitly when `yield`. For example, -the :code:`dataprovider` is shown as below. - -.. literalinclude:: src/mnist_provider.dict.py - :linenos: - -If user did't give the :code:`data_layer`'s name, PaddlePaddle will use -the order of :code:`data_layer` definition roughly to determine which feature to -which :code:`data_layer`. This order may be not correct, so TO DEFINE THE -:code:`data_layer`'s NAMES EXPLICITLY IS THE RECOMMANDED WAY TO PROVIDER DATA. - -Now, this simple example of using PyDataProvider is finished. -The only thing that the user should know is how to generte **one sample** from -**one data file**. -And PaddlePadle will do all of the rest things\: - -* Form a training batch -* Shuffle the training data -* Read data with multithreading -* Cache the training data (Optional) -* CPU-> GPU double buffering. - -Is this cool? - -.. _api_pydataprovider2_sequential_model: - -DataProvider for the sequential model -------------------------------------- -A sequence model takes sequences as its input. A sequence is made up of several -timesteps. The so-called timestep, is not necessary to have something to do -with time. 
It can also be explained to that the order of data are taken into -consideration into model design and training. -For example, the sentence can be interpreted as a kind of sequence data in NLP -tasks. - -Here is an example on data proivider for English sentiment classification data. -The original input data are simple English text, labeled into positive or -negative sentiment (marked by 0 and 1 respectively). - -A small part of the original data as an example can be found in the path below: - -.. literalinclude:: src/sentimental_train.txt - -The corresponding data provider can be found in the path below: - -.. literalinclude:: src/sentimental_provider.py - -This data provider for sequential model is a little more complex than that -for MINST dataset. -A new initialization method is introduced here. -The method :code:`on_init` is configured to DataProvider by :code:`@provider`'s -:code:`init_hook` parameter, and it will be invoked once DataProvider is -initialized. The :code:`on_init` function has the following parameters: - -* The first parameter is the settings object. -* The rest parameters are passed by key word arguments. Some of them are passed - by PaddlePaddle, see reference for `init_hook`_. - The :code:`dictionary` object is a python dict object passed from the trainer - configuration file, and it maps word string to word id. - -To pass these parameters into DataProvider, the following lines should be added -into trainer configuration file. - -.. literalinclude:: src/sentimental_config.py - -The definition is basically same as MNIST example, except: -* Load dictionary in this configuration -* Pass it as a parameter to the DataProvider - -The `input_types` is configured in method :code:`on_init`. It has the same -effect to configure them by :code:`@provider`'s :code:`input_types` parameter. -However, the :code:`input_types` is set at runtime, so we can set it to -different types according to the input data. Input of the neural network is a -sequence of word id, so set :code:`seq_type` to :code:`integer_value_sequence`. - -Durning :code:`on_init`, we save :code:`dictionary` variable to -:code:`settings`, and it will be used in :code:`process`. Note the settings -parameter for the process function and for the on_init's function are a same -object. - -The basic processing logic is the same as MNIST's :code:`process` method. Each -sample in the data file is given back to PaddlePaddle process. - -Thus, the basic usage of PyDataProvider is here. -Please refer to the following section reference for details. - -Reference ---------- - -@provider -+++++++++ - -.. autofunction:: paddle.trainer.PyDataProvider2.provider - -input_types -+++++++++++ - -PaddlePaddle has four data types, and three sequence types. -The four data types are: - -* :code:`dense_vector`: dense float vector. -* :code:`sparse_binary_vector`: sparse binary vector, most of the value is 0, and - the non zero elements are fixed to 1. -* :code:`sparse_float_vector`: sparse float vector, most of the value is 0, and some - non zero elements can be any float value. They are given by the user. -* :code:`integer`: an integer scalar, that is especially used for label or word index. - -The three sequence types are: - -* :code:`SequenceType.NO_SEQUENCE` means the sample is not a sequence. -* :code:`SequenceType.SEQUENCE` means the sample is a sequence. -* :code:`SequenceType.SUB_SEQUENCE` means it is a nested sequence, that each timestep of - the input sequence is also a sequence. - -Different input type has a defferenct input format. 
Their formats are shown -in the above table. - -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| | NO_SEQUENCE | SEQUENCE | SUB_SEQUENCE | -+======================+=====================+===================================+================================================+ -| dense_vector | [f, f, ...] | [[f, ...], [f, ...], ...] | [[[f, ...], ...], [[f, ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| sparse_binary_vector | [i, i, ...] | [[i, ...], [i, ...], ...] | [[[i, ...], ...], [[i, ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| sparse_float_vector | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| integer_value | i | [i, i, ...] | [[i, ...], [i, ...], ...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ - -where f represents a float value, i represents an integer value. - -init_hook -+++++++++ - -init_hook is a function that is invoked once the data provoder is initialized. -Its parameters lists as follows: - -* The first parameter is a settings object, which is the same to :code:`settings` - in :code:`process` method. The object contains several attributes, including: - - * :code:`settings.input_types`: the input types. Reference `input_types`_. - * :code:`settings.logger`: a logging object. - -* The rest parameters are the key word arguments. It is made up of PaddpePaddle - pre-defined parameters and user defined parameters. - - * PaddlePaddle-defined parameters including: - - * :code:`is_train` is a bool parameter that indicates the DataProvider is used in - training or testing. - * :code:`file_list` is the list of all files. - - * User-defined parameters args can be set in training configuration. - -Note, PaddlePaddle reserves the right to add pre-defined parameter, so please -use :code:`**kwargs` in init_hook to ensure compatibility by accepting the -parameters which your init_hook does not use. - -cache -+++++ -DataProvider provides two simple cache strategy. They are: - -* :code:`CacheType.NO_CACHE` means do not cache any data, then data is read at runtime by - the user implemented python module every pass. -* :code:`CacheType.CACHE_PASS_IN_MEM` means the first pass reads data by the user - implemented python module, and the rest passes will directly read data from - memory. diff --git a/doc/api/v1/data_provider/src/mnist_config.py b/doc/api/v1/data_provider/src/mnist_config.py deleted file mode 100644 index d2af9d849e..0000000000 --- a/doc/api/v1/data_provider/src/mnist_config.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -define_py_data_sources2( - train_list='train.list', - test_list=None, - module='mnist_provider', - obj='process') - -img = data_layer(name='pixel', size=784) -label = data_layer(name='label', size=10) diff --git a/doc/api/v1/data_provider/src/mnist_provider.dict.py b/doc/api/v1/data_provider/src/mnist_provider.dict.py deleted file mode 100644 index 284f7dadb0..0000000000 --- a/doc/api/v1/data_provider/src/mnist_provider.dict.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * - - -# Define a py data provider -@provider( - input_types={'pixel': dense_vector(28 * 28), - 'label': integer_value(10)}) -def process(settings, filename): # settings is not used currently. - f = open(filename, 'r') # open one of training file - - for line in f: # read each line - label, pixel = line.split(';') - - # get features and label - pixels_str = pixel.split(' ') - - pixels_float = [] - for each_pixel_str in pixels_str: - pixels_float.append(float(each_pixel_str)) - - # give data to paddle. 
- yield {"pixel": pixels_float, 'label': int(label)} - - f.close() # close file diff --git a/doc/api/v1/data_provider/src/mnist_train.txt b/doc/api/v1/data_provider/src/mnist_train.txt deleted file mode 100644 index 34be718ad9..0000000000 --- a/doc/api/v1/data_provider/src/mnist_train.txt +++ /dev/null @@ -1,3 +0,0 @@ -5;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.215686 0.533333 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.67451 0.992157 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.070588 0.886275 0.992157 0 0 0 0 0 0 0 0 0 0 0.192157 0.070588 0 0 0 0 0 0 0 0 0 0 0 0 0 0.670588 0.992157 0.992157 0 0 0 0 0 0 0 0 0 0.117647 0.933333 0.858824 0.313725 0 0 0 0 0 0 0 0 0 0 0 0.090196 0.858824 0.992157 0.831373 0 0 0 0 0 0 0 0 0 0.141176 0.992157 0.992157 0.611765 0.054902 0 0 0 0 0 0 0 0 0 0 0.258824 0.992157 0.992157 0.529412 0 0 0 0 0 0 0 0 0 0.368627 0.992157 0.992157 0.419608 0.003922 0 0 0 0 0 0 0 0 0 0.094118 0.835294 0.992157 0.992157 0.517647 0 0 0 0 0 0 0 0 0 0.603922 0.992157 0.992157 0.992157 0.603922 0.545098 0.043137 0 0 0 0 0 0 0 0.447059 0.992157 0.992157 0.956863 0.062745 0 0 0 0 0 0 0 0 0.011765 0.666667 0.992157 0.992157 0.992157 0.992157 0.992157 0.745098 0.137255 0 0 0 0 0 0.152941 0.866667 0.992157 0.992157 0.521569 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.992157 0.803922 0.352941 0.745098 0.992157 0.945098 0.317647 0 0 0 0 0.580392 0.992157 0.992157 0.764706 0.043137 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.776471 0.043137 0 0.007843 0.27451 0.882353 0.941176 0.176471 0 0 0.180392 0.898039 0.992157 0.992157 0.313725 0 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.713725 0 0 0 0 0.627451 0.992157 0.729412 0.062745 0 0.509804 0.992157 0.992157 0.776471 0.035294 0 0 0 0 0 0 0 0 0 0 0.494118 0.992157 0.992157 0.968627 0.168627 0 0 0 0.423529 0.992157 0.992157 0.364706 0 0.717647 0.992157 0.992157 0.317647 0 0 0 0 0 0 0 0 0 0 0 0.533333 0.992157 0.984314 0.945098 0.603922 0 0 0 0.003922 0.466667 0.992157 0.988235 0.976471 0.992157 0.992157 0.788235 0.007843 0 0 0 0 0 0 0 0 0 0 0 0.686275 0.882353 0.364706 0 0 0 0 0 0 0.098039 0.588235 0.992157 0.992157 0.992157 0.980392 0.305882 0 0 0 0 0 0 0 0 0 0 0 0 0.101961 0.67451 0.321569 0 0 0 0 0 0 0 0.105882 0.733333 0.976471 0.811765 0.713725 0 0 0 0 0 0 0 0 0 0 0 0 0 0.65098 0.992157 0.321569 0 0 0 0 0 0 0 0 0 0.25098 0.007843 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0.94902 0.219608 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.968627 0.764706 0.152941 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.498039 0.25098 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0; -0;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.298039 0.333333 0.333333 0.333333 0.337255 0.333333 0.333333 0.109804 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.027451 0.223529 0.776471 0.964706 
0.988235 0.988235 0.988235 0.992157 0.988235 0.988235 0.780392 0.098039 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.14902 0.698039 0.988235 0.992157 0.988235 0.901961 0.87451 0.568627 0.882353 0.976471 0.988235 0.988235 0.501961 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.188235 0.647059 0.988235 0.988235 0.745098 0.439216 0.098039 0 0 0 0.572549 0.988235 0.988235 0.988235 0 0 0 0 0 0 0 0 0 0 0 0 0 0.2 0.933333 0.992157 0.941176 0.247059 0 0 0 0 0 0 0.188235 0.898039 0.992157 0.992157 0 0 0 0 0 0 0 0 0 0 0 0.039216 0.639216 0.933333 0.988235 0.913725 0.278431 0 0 0 0 0 0 0 0.113725 0.843137 0.988235 0.988235 0 0 0 0 0 0 0 0 0 0 0 0.235294 0.988235 0.992157 0.988235 0.815686 0.07451 0 0 0 0 0 0 0 0.333333 0.988235 0.988235 0.552941 0 0 0 0 0 0 0 0 0 0 0.211765 0.878431 0.988235 0.992157 0.701961 0.329412 0.109804 0 0 0 0 0 0 0 0.698039 0.988235 0.913725 0.145098 0 0 0 0 0 0 0 0 0 0.188235 0.890196 0.988235 0.988235 0.745098 0.047059 0 0 0 0 0 0 0 0 0 0.882353 0.988235 0.568627 0 0 0 0 0 0 0 0 0 0.2 0.933333 0.992157 0.992157 0.992157 0.447059 0.294118 0 0 0 0 0 0 0 0 0.447059 0.992157 0.768627 0 0 0 0 0 0 0 0 0 0 0.623529 0.988235 0.988235 0.988235 0.988235 0.992157 0.47451 0 0 0 0 0 0 0 0.188235 0.933333 0.87451 0.509804 0 0 0 0 0 0 0 0 0 0 0.992157 0.988235 0.937255 0.792157 0.988235 0.894118 0.082353 0 0 0 0 0 0 0.027451 0.647059 0.992157 0.654902 0 0 0 0 0 0 0 0 0 0 0 0.623529 0.988235 0.913725 0.329412 0.376471 0.184314 0 0 0 0 0 0 0.027451 0.513725 0.988235 0.635294 0.219608 0 0 0 0 0 0 0 0 0 0 0 0.196078 0.929412 0.988235 0.988235 0.741176 0.309804 0 0 0 0 0 0 0.529412 0.988235 0.678431 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.223529 0.992157 0.992157 1 0.992157 0.992157 0.992157 0.992157 1 0.992157 0.992157 0.882353 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.023529 0.478431 0.654902 0.658824 0.952941 0.988235 0.988235 0.988235 0.992157 0.988235 0.729412 0.278431 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.196078 0.647059 0.764706 0.764706 0.768627 0.580392 0.047059 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0; -4;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.180392 0.470588 0.623529 0.623529 0.623529 0.588235 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.243137 0.494118 0.862745 0.870588 0.960784 0.996078 0.996078 0.996078 0.996078 0.992157 0.466667 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.317647 0.639216 0.639216 0.639216 0.639216 0.639216 0.470588 0.262745 0.333333 0.929412 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.184314 0.992157 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.192157 0.996078 0.384314 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.454902 0.980392 0.219608 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.564706 0.941176 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.588235 0.776471 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.945098 0.560784 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.054902 0.952941 0.356863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0.337255 0.917647 0.109804 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.698039 0.701961 0.019608 0.4 0.662745 0.662745 0.662745 0.662745 0.662745 0.662745 0.662745 0.376471 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.090196 0.639216 0.972549 0.945098 0.913725 0.996078 0.996078 0.996078 0.996078 1 0.996078 0.996078 1 0.996078 0 0 0 0 0 0 0 0 0 0 0.007843 0.105882 0.717647 0.776471 0.905882 0.996078 0.996078 0.988235 0.980392 0.862745 0.537255 0.223529 0.223529 0.368627 0.376471 0.6 0.6 0.6 0 0 0 0 0 0 0 0 0.262745 0.470588 0.6 0.996078 0.996078 0.996078 0.996078 0.847059 0.356863 0.156863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.909804 0.705882 0.823529 0.635294 0.490196 0.219608 0.113725 0.062745 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.152941 0.152941 0.156863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0; diff --git a/doc/api/v1/data_provider/src/sentimental_config.py b/doc/api/v1/data_provider/src/sentimental_config.py deleted file mode 100644 index 56adde13b9..0000000000 --- a/doc/api/v1/data_provider/src/sentimental_config.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -dictionary = dict() -... # read dictionary from outside - -define_py_data_sources2( - train_list='train.list', - test_list=None, - module='sentimental_provider', - obj='process', - # above codes same as mnist sample. - args={ # pass to provider. - 'dictionary': dictionary - }) diff --git a/doc/api/v1/data_provider/src/sentimental_provider.py b/doc/api/v1/data_provider/src/sentimental_provider.py deleted file mode 100644 index 59a2b6f7f5..0000000000 --- a/doc/api/v1/data_provider/src/sentimental_provider.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * - - -def on_init(settings, dictionary, **kwargs): - # on_init will invoke when data provider is initialized. The dictionary - # is passed from trainer_config, and is a dict object with type - # (word string => word id). - - # set input types in runtime. 
It will do the same thing as - # @provider(input_types) will do, but it is set dynamically during runtime. - settings.input_types = { - # The text is a sequence of integer values, and each value is a word id. - # The whole sequence is the sentences that we want to predict its - # sentimental. - 'data': integer_value_sequence(len(dictionary)), # text input - 'label': integer_value(2) # label positive/negative - } - - # save dictionary as settings.dictionary. - # It will be used in process method. - settings.dictionary = dictionary - - -@provider(init_hook=on_init) -def process(settings, filename): - f = open(filename, 'r') - - for line in f: # read each line of file - label, sentence = line.split('\t') # get label and sentence - words = sentence.split(' ') # get words - - # convert word string to word id - # the word not in dictionary will be ignored. - word_ids = [] - - for each_word in words: - if each_word in settings.dictionary: - word_ids.append(settings.dictionary[each_word]) - - # give data to paddle. - yield word_ids, int(label) - - f.close() diff --git a/doc/api/v1/data_provider/src/sentimental_train.txt b/doc/api/v1/data_provider/src/sentimental_train.txt deleted file mode 100644 index 0060ac267c..0000000000 --- a/doc/api/v1/data_provider/src/sentimental_train.txt +++ /dev/null @@ -1,3 +0,0 @@ -0 I saw this movie at the AFI Dallas festival . It all takes place at a lake house and it looks wonderful . -1 This documentary makes you travel all around the globe . It contains rare and stunning sequels from the wilderness . -... diff --git a/doc/api/v1/data_provider/src/train.list b/doc/api/v1/data_provider/src/train.list deleted file mode 100644 index 92bdc0a8b4..0000000000 --- a/doc/api/v1/data_provider/src/train.list +++ /dev/null @@ -1 +0,0 @@ -mnist_train.txt diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst deleted file mode 100644 index cf146dc088..0000000000 --- a/doc/api/v1/index_cn.rst +++ /dev/null @@ -1,37 +0,0 @@ -API中文手册 -============ - -DataProvider API ----------------- - -.. toctree:: - :maxdepth: 1 - - data_provider/dataprovider_cn.rst - data_provider/pydataprovider2_cn.rst - -.. _api_trainer_config: - -Model Config API ----------------- - -.. toctree:: - :maxdepth: 1 - - trainer_config_helpers/optimizers.rst - trainer_config_helpers/data_sources.rst - trainer_config_helpers/layers.rst - trainer_config_helpers/activations.rst - trainer_config_helpers/poolings.rst - trainer_config_helpers/networks.rst - trainer_config_helpers/evaluators.rst - trainer_config_helpers/attrs.rst - - -Applications API ----------------- - -.. toctree:: - :maxdepth: 1 - - predict/swig_py_paddle_cn.rst diff --git a/doc/api/v1/index_en.rst b/doc/api/v1/index_en.rst deleted file mode 100644 index 10c297a71d..0000000000 --- a/doc/api/v1/index_en.rst +++ /dev/null @@ -1,37 +0,0 @@ -API -=== - -DataProvider API ----------------- - -.. toctree:: - :maxdepth: 1 - - data_provider/dataprovider_en.rst - data_provider/pydataprovider2_en.rst - -.. _api_trainer_config: - -Model Config API ----------------- - -.. toctree:: - :maxdepth: 1 - - trainer_config_helpers/optimizers.rst - trainer_config_helpers/data_sources.rst - trainer_config_helpers/layers.rst - trainer_config_helpers/activations.rst - trainer_config_helpers/poolings.rst - trainer_config_helpers/networks.rst - trainer_config_helpers/evaluators.rst - trainer_config_helpers/attrs.rst - - -Applications API ----------------- - -.. 
toctree:: - :maxdepth: 1 - - predict/swig_py_paddle_en.rst diff --git a/doc/api/v1/predict/src/predict_sample.py b/doc/api/v1/predict/src/predict_sample.py deleted file mode 100644 index 51349250e8..0000000000 --- a/doc/api/v1/predict/src/predict_sample.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import dense_vector -from paddle.trainer.config_parser import parse_config - -TEST_DATA = [[[ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.215686, 0.533333, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.67451, 0.992157, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.070588, 0.886275, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.192157, - 0.070588, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.670588, 0.992157, - 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.117647, 0.933333, 0.858824, 0.313725, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.090196, 0.858824, 0.992157, 0.831373, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0.141176, 0.992157, 0.992157, 0.611765, 0.054902, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.258824, 0.992157, 0.992157, 0.529412, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0.368627, 0.992157, 0.992157, 0.419608, 0.003922, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0.094118, 0.835294, 0.992157, 0.992157, 0.517647, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0.603922, 0.992157, 0.992157, 0.992157, 0.603922, - 0.545098, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0.447059, 0.992157, 0.992157, - 0.956863, 0.062745, 0, 0, 0, 0, 0, 0, 0, 0, 0.011765, 0.666667, 0.992157, - 0.992157, 0.992157, 0.992157, 0.992157, 0.745098, 0.137255, 0, 0, 0, 0, 0, - 0.152941, 0.866667, 0.992157, 0.992157, 0.521569, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.070588, 0.992157, 0.992157, 0.992157, 0.803922, 0.352941, 0.745098, - 0.992157, 0.945098, 0.317647, 0, 0, 0, 0, 0.580392, 0.992157, 0.992157, - 0.764706, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.992157, 0.992157, - 0.776471, 0.043137, 0, 0.007843, 0.27451, 0.882353, 0.941176, 0.176471, 0, - 0, 0.180392, 0.898039, 0.992157, 0.992157, 0.313725, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0.070588, 0.992157, 0.992157, 0.713725, 0, 0, 0, 0, 0.627451, - 0.992157, 0.729412, 0.062745, 0, 0.509804, 0.992157, 0.992157, 0.776471, - 0.035294, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.494118, 0.992157, 0.992157, - 0.968627, 0.168627, 0, 0, 0, 0.423529, 0.992157, 0.992157, 0.364706, 0, - 0.717647, 0.992157, 0.992157, 0.317647, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.533333, 0.992157, 0.984314, 0.945098, 0.603922, 0, 0, 0, 0.003922, - 0.466667, 0.992157, 0.988235, 
0.976471, 0.992157, 0.992157, 0.788235, - 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.686275, 0.882353, 0.364706, 0, - 0, 0, 0, 0, 0, 0.098039, 0.588235, 0.992157, 0.992157, 0.992157, 0.980392, - 0.305882, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.101961, 0.67451, 0.321569, - 0, 0, 0, 0, 0, 0, 0, 0.105882, 0.733333, 0.976471, 0.811765, 0.713725, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.65098, 0.992157, 0.321569, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0.25098, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 0.94902, 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0.968627, 0.764706, 0.152941, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.498039, 0.25098, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0 -]], [[ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0.298039, 0.333333, 0.333333, 0.333333, 0.337255, - 0.333333, 0.333333, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0.027451, 0.223529, 0.776471, 0.964706, 0.988235, 0.988235, 0.988235, - 0.992157, 0.988235, 0.988235, 0.780392, 0.098039, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0.14902, 0.698039, 0.988235, 0.992157, 0.988235, 0.901961, - 0.87451, 0.568627, 0.882353, 0.976471, 0.988235, 0.988235, 0.501961, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.647059, 0.988235, 0.988235, - 0.745098, 0.439216, 0.098039, 0, 0, 0, 0.572549, 0.988235, 0.988235, - 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0.933333, 0.992157, - 0.941176, 0.247059, 0, 0, 0, 0, 0, 0, 0.188235, 0.898039, 0.992157, - 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.039216, 0.639216, 0.933333, - 0.988235, 0.913725, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0.113725, 0.843137, - 0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.235294, 0.988235, - 0.992157, 0.988235, 0.815686, 0.07451, 0, 0, 0, 0, 0, 0, 0, 0.333333, - 0.988235, 0.988235, 0.552941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.211765, - 0.878431, 0.988235, 0.992157, 0.701961, 0.329412, 0.109804, 0, 0, 0, 0, 0, - 0, 0, 0.698039, 0.988235, 0.913725, 0.145098, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.188235, 0.890196, 0.988235, 0.988235, 0.745098, 0.047059, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0.882353, 0.988235, 0.568627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, - 0.933333, 0.992157, 0.992157, 0.992157, 0.447059, 0.294118, 0, 0, 0, 0, 0, - 0, 0, 0, 0.447059, 0.992157, 0.768627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.623529, 0.988235, 0.988235, 0.988235, 0.988235, 0.992157, 0.47451, 0, 0, - 0, 0, 0, 0, 0, 0.188235, 0.933333, 0.87451, 0.509804, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0.992157, 0.988235, 0.937255, 0.792157, 0.988235, 0.894118, - 0.082353, 0, 0, 0, 0, 0, 0, 0.027451, 0.647059, 0.992157, 
0.654902, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.913725, 0.329412, 0.376471, - 0.184314, 0, 0, 0, 0, 0, 0, 0.027451, 0.513725, 0.988235, 0.635294, - 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.929412, 0.988235, - 0.988235, 0.741176, 0.309804, 0, 0, 0, 0, 0, 0, 0.529412, 0.988235, - 0.678431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.223529, 0.992157, - 0.992157, 1, 0.992157, 0.992157, 0.992157, 0.992157, 1, 0.992157, 0.992157, - 0.882353, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.023529, - 0.478431, 0.654902, 0.658824, 0.952941, 0.988235, 0.988235, 0.988235, - 0.992157, 0.988235, 0.729412, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.647059, 0.764706, 0.764706, 0.768627, - 0.580392, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0 -]]] - - -def main(): - conf = parse_config("./mnist_model/trainer_config.py", "") - print conf.data_config.load_data_args - network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - assert isinstance(network, swig_paddle.GradientMachine) # For code hint. - network.loadParameters("./mnist_model/") - converter = DataProviderConverter([dense_vector(784)]) - inArg = converter(TEST_DATA) - print network.forwardTest(inArg) - - -if __name__ == '__main__': - swig_paddle.initPaddle("--use_gpu=0") - main() diff --git a/doc/api/v1/predict/swig_py_paddle_cn.rst b/doc/api/v1/predict/swig_py_paddle_cn.rst deleted file mode 100644 index 42f333dba2..0000000000 --- a/doc/api/v1/predict/swig_py_paddle_cn.rst +++ /dev/null @@ -1,58 +0,0 @@ -.. _api_swig_py_paddle: - -基于Python的预测 -================ - -预测流程 --------- - -PaddlePaddle使用swig对常用的预测接口进行了封装,通过编译会生成py_paddle软件包,安装该软件包就可以在python环境下实现模型预测。可以使用python的 ``help()`` 函数查询软件包相关API说明。 - -基于Python的模型预测,主要包括以下五个步骤。 - -1. 初始化PaddlePaddle环境 - - 在程序开始阶段,通过调用 ``swig_paddle.initPaddle()`` 并传入相应的命令行参数初始化PaddlePaddle。 - -2. 解析模型配置文件 - - 初始化之后,可以通过调用 ``parse_config()`` 解析训练模型时用的配置文件。注意预测数据通常不包含label, 同时预测网络通常直接输出最后一层的结果而不是像训练网络一样再接一层cost layer,所以一般需要对训练用的模型配置文件稍作相应修改才能在预测时使用。 - -3. 构造paddle.GradientMachine - - 通过调用 ``swig_paddle.GradientMachine.createFromConfigproto()`` 传入上一步解析出来的模型配置就可以创建一个 ``GradientMachine``。 - -4. 准备预测数据 - - swig_paddle中的预测接口的参数是自定义的C++数据类型,py_paddle里面提供了一个工具类 ``DataProviderConverter`` 可以用于接收和PyDataProvider2一样的输入数据并转换成预测接口所需的数据类型。 - -5. 模型预测 - - 通过调用 ``forwardTest()`` 传入预测数据,直接返回计算结果。 - - -预测Demo --------- - -如下是一段使用mnist model来实现手写识别的预测代码。完整的代码见 ``src_root/doc/ui/predict/predict_sample.py`` 。mnist model可以通过 ``src_root\demo\mnist`` 目录下的demo训练出来。 - -.. literalinclude:: src/predict_sample.py - :language: python - :lines: 15-18,121-136 - - -Demo预测输出如下,其中value即为softmax层的输出。由于TEST_DATA包含两条预测数据,所以输出的value包含两个向量 。 - -.. 
code-block:: text - - [{'id': None, 'value': array( - [[ 5.53018653e-09, 1.12194102e-05, 1.96644767e-09, - 1.43630644e-02, 1.51111044e-13, 9.85625684e-01, - 2.08823112e-10, 2.32777140e-08, 2.00186201e-09, - 1.15501715e-08], - [ 9.99982715e-01, 1.27787406e-10, 1.72296313e-05, - 1.49316648e-09, 1.36540484e-11, 6.93137714e-10, - 2.70634608e-08, 3.48565123e-08, 5.25639710e-09, - 4.48684503e-08]], dtype=float32)}] - - diff --git a/doc/api/v1/predict/swig_py_paddle_en.rst b/doc/api/v1/predict/swig_py_paddle_en.rst deleted file mode 100644 index 1c628e6971..0000000000 --- a/doc/api/v1/predict/swig_py_paddle_en.rst +++ /dev/null @@ -1,59 +0,0 @@ -Python Prediction -================== - -PaddlePaddle offers a set of clean prediction interfaces for python with the help of -SWIG. The main steps of predict values in python are: - -* Parse training configurations -* Construct GradientMachine -* Prepare data -* Predict - -Here is a sample python script that shows the typical prediction process for the -MNIST classification problem. A complete sample code could be found at -:code:`src_root/doc/ui/predict/predict_sample.py`. - -.. literalinclude:: src/predict_sample.py - :language: python - :lines: 15-18,90-100,101-104 - -The module that does the most of the job is py_paddle.swig_paddle, it's -generated by SWIG and has complete documents, for more details you can use -python's :code:`help()` function. Let's walk through the above python script: - -* At the beginning, use :code:`swig_paddle.initPaddle()` to initialize - PaddlePaddle with command line arguments, for more about command line arguments - see :ref:`cmd_detail_introduction` . -* Parse the configuration file that is used in training with :code:`parse_config()`. - Because data to predict with always have no label, and output of prediction work - normally is the output layer rather than the cost layer, so you should modify - the configuration file accordingly before using it in the prediction work. -* Create a neural network with - :code:`swig_paddle.GradientMachine.createFromConfigproto()`, which takes the - parsed configuration :code:`conf.model_config` as argument. Then load the - trained parameters from the model with :code:`network.loadParameters()`. -* Create a data converter object of utility class :code:`DataProviderConverter`. - - Note: As swig_paddle can only accept C++ matrices, we offer a utility - class DataProviderConverter that can accept the same input data with - PyDataProvider2, for more information please refer to document - of :ref:`api_pydataprovider2` . -* Do the prediction with :code:`forwardTest()`, which takes the converted - input data and outputs the activations of the output layer. - -Here is a typical output: - -.. code-block:: text - - [{'id': None, 'value': array([[ 5.53018653e-09, 1.12194102e-05, 1.96644767e-09, - 1.43630644e-02, 1.51111044e-13, 9.85625684e-01, - 2.08823112e-10, 2.32777140e-08, 2.00186201e-09, - 1.15501715e-08], - [ 9.99982715e-01, 1.27787406e-10, 1.72296313e-05, - 1.49316648e-09, 1.36540484e-11, 6.93137714e-10, - 2.70634608e-08, 3.48565123e-08, 5.25639710e-09, - 4.48684503e-08]], dtype=float32)}] - -:code:`value` is the output of the output layer, each row represents result of -the corresponding row in the input data, each element represents activation of -the corresponding neuron in the output layer. 
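
The prediction steps listed above can be condensed into one short script. Below is a minimal sketch distilled from the ``predict_sample.py`` removed earlier in this patch; it assumes a trained MNIST model saved under a hypothetical ``./mnist_model/`` directory and a 784-dimensional dense input, and it only uses the calls documented above (``initPaddle``, ``parse_config``, ``createFromConfigProto``, ``loadParameters``, ``DataProviderConverter``, ``forwardTest``).

.. code-block:: python

    from py_paddle import swig_paddle, DataProviderConverter
    from paddle.trainer.PyDataProvider2 import dense_vector
    from paddle.trainer.config_parser import parse_config

    # Hypothetical input: one 784-dim sample; replace the zeros with real pixels.
    TEST_DATA = [[[0.0] * 784]]


    def main():
        # Parse the configuration used for training (adapted for prediction).
        conf = parse_config("./mnist_model/trainer_config.py", "")
        # Build the network and load the trained parameters.
        network = swig_paddle.GradientMachine.createFromConfigProto(
            conf.model_config)
        network.loadParameters("./mnist_model/")
        # Convert PyDataProvider2-style input into the C++ argument type.
        converter = DataProviderConverter([dense_vector(784)])
        in_arg = converter(TEST_DATA)
        # Forward pass; the result is the output-layer activation.
        print network.forwardTest(in_arg)


    if __name__ == '__main__':
        swig_paddle.initPaddle("--use_gpu=0")
        main()

Run as a plain Python 2 script, this returns the same ``[{'id': None, 'value': array([...])}]`` structure shown in the sample output above, one row per input sample.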
- From 3a48282e61750688c02ab3330b7373b37d81ee74 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 22 Jan 2018 15:07:08 +0800 Subject: [PATCH 33/54] Fix unitest --- paddle/operators/im2sequence_op.cc | 10 +-- paddle/operators/im2sequence_op.h | 16 ++-- .../v2/fluid/tests/test_im2sequence_op.py | 73 +++++++------------ 3 files changed, 38 insertions(+), 61 deletions(-) diff --git a/paddle/operators/im2sequence_op.cc b/paddle/operators/im2sequence_op.cc index 9c9802c043..1854fc384c 100644 --- a/paddle/operators/im2sequence_op.cc +++ b/paddle/operators/im2sequence_op.cc @@ -64,11 +64,11 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(LodTensor)The output data of im2sequence op,"); AddAttr>("kernels", "(vector), the " - "kernels(kernel_height, kernel_width)") - AddAttr>("strides", - "(vector default:{1, 1}), the " - "strides(h_stride, w_stride)") - .SetDefault({1, 1}); + "kernels(kernel_height, kernel_width)"); + AddAttr>("strides", + "(vector default:{1, 1}), the " + "strides(h_stride, w_stride)") + .SetDefault({1, 1}); AddAttr>("paddings", "(vector default:{0, 0, 0, 0}), the " "paddings(up_pad, left_pad, down_pad, right_pad)") diff --git a/paddle/operators/im2sequence_op.h b/paddle/operators/im2sequence_op.h index 352d290b1b..aeb8100151 100644 --- a/paddle/operators/im2sequence_op.h +++ b/paddle/operators/im2sequence_op.h @@ -50,11 +50,11 @@ class Im2SequenceKernel : public framework::OpKernel { int img_height = in_dim[2]; int img_width = in_dim[3]; - auto kernels = ctx->Attrs().Get>("kernels"); - auto strides = ctx->Attrs().Get>("strides"); - auto paddings = ctx->Attrs().Get>("paddings"); - int output_height = - OutputSize(img_height, kernels[0], paddings[0], paddings[2] strides[0]); + auto kernels = ctx.Attr>("kernels"); + auto strides = ctx.Attr>("strides"); + auto paddings = ctx.Attr>("paddings"); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); int output_width = OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); @@ -106,9 +106,9 @@ class Im2SequenceGradKernel : public framework::OpKernel { int img_height = in_dim[2]; int img_width = in_dim[3]; - auto kernels = ctx->Attrs().Get>("kernels"); - auto strides = ctx->Attrs().Get>("strides"); - auto paddings = ctx->Attrs().Get>("paddings"); + auto kernels = ctx.Attr>("kernels"); + auto strides = ctx.Attr>("strides"); + auto paddings = ctx.Attr>("paddings"); int output_height = OutputSize(img_height, kernels[0], paddings[0], paddings[2], strides[0]); int output_width = diff --git a/python/paddle/v2/fluid/tests/test_im2sequence_op.py b/python/paddle/v2/fluid/tests/test_im2sequence_op.py index cd1b2164f0..2cab3e31a5 100644 --- a/python/paddle/v2/fluid/tests/test_im2sequence_op.py +++ b/python/paddle/v2/fluid/tests/test_im2sequence_op.py @@ -20,22 +20,19 @@ def get_output_shape(attrs, in_shape): img_height = in_shape[2] img_width = in_shape[3] - padding_height = attrs['padding_height'] - padding_width = attrs['padding_width'] - block_height = attrs['block_height'] - block_width = attrs['block_width'] - stride_height = attrs['stride_height'] - stride_width = attrs['stride_width'] + paddings = attrs['paddings'] + kernels = attrs['kernels'] + strides = attrs['strides'] output_height = \ 1 + \ - (img_height + 2 * padding_height - block_height + stride_height - 1) / \ - stride_height + (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \ + strides[0] output_width = \ 1 + \ - (img_width + 2 * padding_width - 
block_width + stride_width - 1) / \ - stride_width + (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \ + strides[1] return output_height, output_width @@ -46,19 +43,11 @@ def im2col(attrs, im, col): col: {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} """ - input_channels = im.shape[0] - input_height = im.shape[1] - input_width = im.shape[2] + input_channels, input_height, input_width = im.shape + output_height, output_width, _, filter_height, filter_width = col.shape - output_height = col.shape[0] - output_width = col.shape[1] - filter_height = col.shape[3] - filter_width = col.shape[4] - - stride_height = attrs['stride_height'] - stride_width = attrs['stride_width'] - padding_height = attrs['padding_height'] - padding_width = attrs['padding_width'] + stride_height, stride_width = attrs['strides'] + padding_height, padding_width = attrs['paddings'][0:2] for col_row_idx in range(0, output_height): for col_col_idx in range(0, output_width): @@ -92,7 +81,7 @@ def Im2Sequence(inputs, attrs): batch_size = inputs.shape[0] out = np.zeros([ batch_size, output_height, output_width, img_channels, - attrs['block_height'], attrs['block_width'] + attrs['kernels'][0], attrs['kernels'][1] ]).astype("float32") for i in range(len(inputs)): @@ -100,7 +89,7 @@ def Im2Sequence(inputs, attrs): out = out.reshape([ batch_size * output_height * output_width, - img_channels * attrs['block_height'] * attrs['block_width'] + img_channels * attrs['kernels'][0] * attrs['kernels'][1] ]) return out @@ -112,12 +101,9 @@ class TestBlockExpandOp(OpTest): self.img_height = 4 self.img_width = 4 self.attrs = { - 'block_height': 2, - 'block_width': 2, - 'stride_height': 1, - 'stride_width': 1, - 'padding_height': 1, - 'padding_width': 1, + 'kernels': [2, 2], + 'strides': [1, 1], + 'paddings': [1, 1, 1, 1] } def setUp(self): @@ -145,12 +131,9 @@ class TestBlockExpandOpCase2(TestBlockExpandOp): self.img_height = 4 self.img_width = 5 self.attrs = { - 'block_height': 2, - 'block_width': 1, - 'stride_height': 2, - 'stride_width': 1, - 'padding_height': 2, - 'padding_width': 1, + 'kernels': [2, 1], + 'strides': [2, 1], + 'paddings': [2, 1, 2, 1] } @@ -161,12 +144,9 @@ class TestBlockExpandOpCase3(TestBlockExpandOp): self.img_height = 4 self.img_width = 5 self.attrs = { - 'block_height': 2, - 'block_width': 1, - 'stride_height': 2, - 'stride_width': 1, - 'padding_height': 2, - 'padding_width': 0, + 'kernels': [2, 1], + 'strides': [2, 1], + 'paddings': [2, 0, 2, 0] } @@ -177,12 +157,9 @@ class TestBlockExpandOpCase4(TestBlockExpandOp): self.img_height = 3 self.img_width = 3 self.attrs = { - 'block_height': 2, - 'block_width': 2, - 'stride_height': 1, - 'stride_width': 1, - 'padding_height': 0, - 'padding_width': 0, + 'kernels': [2, 2], + 'strides': [1, 1], + 'paddings': [0, 0, 0, 0] } From b4d6e7cfc3af432338f83817d19d6122428c3fc1 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 22 Jan 2018 15:29:00 +0800 Subject: [PATCH 34/54] add v2 get layer out in faq --- doc/faq/local/index_cn.rst | 43 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst index b331d9d36e..5e3f385f8d 100644 --- a/doc/faq/local/index_cn.rst +++ b/doc/faq/local/index_cn.rst @@ -211,3 +211,46 @@ decoder_inputs = paddle.layer.fc( * list 中元素的个数等于网络中输出层的个数; * list 中每个元素是一个layer的输出结果矩阵,类型是numpy的ndarray; * 每一个layer输出矩阵的高度,在非序列输入时:等于样本数;序列输入时等于:输入序列中元素的总数;宽度等于配置中layer的size; + +6. 
如何在训练过程中获得某一个layer的output +----------------------------------------------- + +可以在event_handler中,通过 :code:`event.gm.getLayerOutputs("layer_name")` 获得在模型配置中某一层的name :code:`layer_name` 在当前 +mini-batch forward的output的值。获得的值类型均为 :code:`numpy.ndarray` ,可以通过这个输出来完成自定义的评估指标计算等功能。例如下面代码: + +.. code-block:: python + + def score_diff(right_score, left_score): + return np.average(np.abs(right_score - left_score)) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 25 == 0: + diff = score_diff( + event.gm.getLayerOutputs("right_score")["right_score"][ + "value"], + event.gm.getLayerOutputs("left_score")["left_score"][ + "value"]) + logger.info(("Pass %d Batch %d : Cost %.6f, " + "average absolute diff scores: %.6f") % + (event.pass_id, event.batch_id, event.cost, diff)) + + +6. 如何在训练过程中获得参数的权重和梯度 +----------------------------------------------- + +在某些情况下,获得当前mini-batch的权重(或称作weights, parameters)有助于在训练时观察具体数值,方便排查以及快速定位问题。 +可以通过在 :code:`event_handler` 中打印其值(注意,需要使用 :code:`paddle.event.EndForwardBackward` 保证使用GPU训练时也可以获得), +示例代码如下: + +.. code-block:: python + + ... + parameters = paddle.parameters.create(cost) + ... + def event_handler(event): + if isinstance(event, paddle.event.EndForwardBackward): + if event.batch_id % 25 == 0: + for p in parameters.keys(): + logger.info("Param %s, Grad %s", + (parameters.get(p), parameters.get_grad(p)) From c0da87f39916e90de3e6c2cf127af73692069a17 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 22 Jan 2018 15:30:02 +0800 Subject: [PATCH 35/54] add TODO in WITH_FLUID option --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad1b6f23c9..f4e7d5c20d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,7 +55,8 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) -option(WITH_FLUID "Compile PaddlePaddle fluid only" ON) +# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. +option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" ON) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) From 1c4968ee8b89b52e78d46eff6e6f2d6bb60f3136 Mon Sep 17 00:00:00 2001 From: ying Date: Mon, 22 Jan 2018 15:34:13 +0800 Subject: [PATCH 36/54] fix copyright --- .../test_memopt_fit_a_line.py | 14 ++++++++++++++ .../test_memopt_image_classification_train.py | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index 6206fcc4be..cf054bb0fe 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np import paddle.v2 as paddle import paddle.v2.fluid as fluid diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index cc37f773c4..42b3cb81ce 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from __future__ import print_function import sys From ccbe7239283be2d7ba8c2eaa234378641ceee75d Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 22 Jan 2018 15:34:43 +0800 Subject: [PATCH 37/54] update sample code --- doc/faq/local/index_cn.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst index 5e3f385f8d..efafaaab35 100644 --- a/doc/faq/local/index_cn.rst +++ b/doc/faq/local/index_cn.rst @@ -236,7 +236,7 @@ mini-batch forward的output的值。获得的值类型均为 :code:`numpy.ndarra (event.pass_id, event.batch_id, event.cost, diff)) -6. 如何在训练过程中获得参数的权重和梯度 +7. 
如何在训练过程中获得参数的权重和梯度 ----------------------------------------------- 在某些情况下,获得当前mini-batch的权重(或称作weights, parameters)有助于在训练时观察具体数值,方便排查以及快速定位问题。 @@ -253,4 +253,7 @@ mini-batch forward的output的值。获得的值类型均为 :code:`numpy.ndarra if event.batch_id % 25 == 0: for p in parameters.keys(): logger.info("Param %s, Grad %s", - (parameters.get(p), parameters.get_grad(p)) + parameters.get(p), parameters.get_grad(p)) + +注意:“在训练过程中获得某一个layer的output”和“在训练过程中获得参数的权重和梯度”都会造成训练中的数据从C++拷贝到numpy,会对训练性能造成影响。不要在 + 注重性能的训练场景下使用。 \ No newline at end of file From ded495336e0803dfc63e7ea479d851f677d15ff7 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 22 Jan 2018 15:36:51 +0800 Subject: [PATCH 38/54] update style --- doc/faq/local/index_cn.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst index efafaaab35..d0cdba90d8 100644 --- a/doc/faq/local/index_cn.rst +++ b/doc/faq/local/index_cn.rst @@ -255,5 +255,4 @@ mini-batch forward的output的值。获得的值类型均为 :code:`numpy.ndarra logger.info("Param %s, Grad %s", parameters.get(p), parameters.get_grad(p)) -注意:“在训练过程中获得某一个layer的output”和“在训练过程中获得参数的权重和梯度”都会造成训练中的数据从C++拷贝到numpy,会对训练性能造成影响。不要在 - 注重性能的训练场景下使用。 \ No newline at end of file +注意:“在训练过程中获得某一个layer的output”和“在训练过程中获得参数的权重和梯度”都会造成训练中的数据从C++拷贝到numpy,会对训练性能造成影响。不要在注重性能的训练场景下使用。 \ No newline at end of file From 160aa64132e5d6a371e01bf55eca668a5601c0c7 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 22 Jan 2018 16:31:20 +0800 Subject: [PATCH 39/54] follow comments --- doc/faq/local/index_cn.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst index d0cdba90d8..0306b1e5dd 100644 --- a/doc/faq/local/index_cn.rst +++ b/doc/faq/local/index_cn.rst @@ -235,6 +235,7 @@ mini-batch forward的output的值。获得的值类型均为 :code:`numpy.ndarra "average absolute diff scores: %.6f") % (event.pass_id, event.batch_id, event.cost, diff)) +注意:此方法不能获取 :code:`paddle.layer.recurrent_group` 里step的内容,但可以获取 :code:`paddle.layer.recurrent_group` 的输出。 7. 如何在训练过程中获得参数的权重和梯度 ----------------------------------------------- From 89c591f37cf50edbf32ef418696e856fb506f83d Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 22 Jan 2018 16:54:24 +0800 Subject: [PATCH 40/54] update grad clip api --- python/paddle/v2/fluid/clip.py | 18 +++++++++++------- .../v2/fluid/tests/test_gradient_clip.py | 3 ++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/v2/fluid/clip.py index 386df9823d..3028029e60 100644 --- a/python/paddle/v2/fluid/clip.py +++ b/python/paddle/v2/fluid/clip.py @@ -12,14 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy + import functools import layers import framework from . 
import core __all__ = [ - 'GradientClipByValue', 'ErrorClipByValue', + 'GradientClipByValue', + 'GradientClipByNorm', + 'GradientClipByGlobalNorm', 'append_gradient_clip_ops', 'error_clip_callback', ] @@ -155,10 +159,11 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): return param, new_grad -def gradient_clip_by_global_norm(clip_norm, - param_list=None, - group_name="default_group", - program=None): +def set_gradient_clip(clip, param_list=None, program=None): + if not isinstance(clip, BaseGradientClipAttr): + raise TypeError( + "'clip' should be an instance of BaseGradientClipAttr's derived class" + ) if program is None: program = framework.default_main_program() if param_list is None: @@ -171,8 +176,7 @@ def gradient_clip_by_global_norm(clip_norm, ) for param in param_list: - param.gradient_clip_attr = GradientClipByGlobalNorm(clip_norm, - group_name) + param.gradient_clip_attr = copy.deepcopy(clip) def append_gradient_clip_ops(param_grad): diff --git a/python/paddle/v2/fluid/tests/test_gradient_clip.py b/python/paddle/v2/fluid/tests/test_gradient_clip.py index 4e6e6a1ef6..9337791c21 100644 --- a/python/paddle/v2/fluid/tests/test_gradient_clip.py +++ b/python/paddle/v2/fluid/tests/test_gradient_clip.py @@ -40,7 +40,8 @@ p_g = fluid.backward.append_backward(loss=avg_cost) p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) with fluid.program_guard(main_program=prog_clip): - fluid.clip.gradient_clip_by_global_norm(clip_norm=CLIP) + fluid.clip.set_gradient_clip( + fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP)) p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) grad_list = [elem[1] for elem in p_g] From 1bc8de32091d11a57cda7af0b38b1766d51a06d5 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 22 Jan 2018 16:59:54 +0800 Subject: [PATCH 41/54] 1. Add sequence_num as edit distance op's output 2. 
Fix evaluator using 'reduce_sum' op instead of 'mean' op --- paddle/operators/CMakeLists.txt | 1 + paddle/operators/edit_distance_op.cc | 4 ++++ paddle/operators/edit_distance_op.cu | 9 +++++++- paddle/operators/edit_distance_op.h | 4 +++- python/paddle/v2/fluid/evaluator.py | 22 +++++++++---------- python/paddle/v2/fluid/layers/nn.py | 6 +++-- .../v2/fluid/tests/test_edit_distance_op.py | 6 +++-- 7 files changed, 35 insertions(+), 17 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 6745a8da17..15f7cb6b56 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -156,6 +156,7 @@ op_library(parallel_do_op DEPS executor) # Regist multiple Kernel to pybind if (WITH_GPU) op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col) +op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function) op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc conv_transpose_cudnn_op.cu.cc DEPS vol2col) diff --git a/paddle/operators/edit_distance_op.cc b/paddle/operators/edit_distance_op.cc index 62a1fcebe7..7e7dfc79eb 100644 --- a/paddle/operators/edit_distance_op.cc +++ b/paddle/operators/edit_distance_op.cc @@ -25,6 +25,8 @@ class EditDistanceOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Hyps"), "Input(Hyps) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("Refs"), "Input(Refs) shouldn't be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("SequenceNum"), + "Output(SequenceNum) shouldn't be null."); auto hyp_dims = ctx->GetInputDim("Hyps"); auto ref_dims = ctx->GetInputDim("Refs"); PADDLE_ENFORCE(hyp_dims.size() == 2 && hyp_dims[1] == 1, @@ -34,6 +36,7 @@ class EditDistanceOp : public framework::OperatorWithKernel { "Input(Refs) must be a 2-D LoDTensor with the 2nd dimension " "equal to 1."); ctx->SetOutputDim("Out", ctx->GetInputDim("Refs")); + ctx->SetOutputDim("SequenceNum", {1}); } protected: @@ -54,6 +57,7 @@ class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Refs", "(2-D LoDTensor, 2nd dim. equal to 1) " "The indices for reference strings."); + AddOutput("SequenceNum", "The sequence count of current batch"); AddAttr("normalized", "(bool, default false) Indicated whether to normalize " "the edit distance by the length of reference string.") diff --git a/paddle/operators/edit_distance_op.cu b/paddle/operators/edit_distance_op.cu index 338fd79bcc..c3e116af08 100644 --- a/paddle/operators/edit_distance_op.cu +++ b/paddle/operators/edit_distance_op.cu @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" #include "paddle/platform/cuda_helper.h" #include "paddle/platform/gpu_info.h" @@ -72,6 +73,8 @@ class EditDistanceGPUKernel : public framework::OpKernel { auto* x1_t = ctx.Input("Hyps"); auto* x2_t = ctx.Input("Refs"); + auto* sequence_num = ctx.Output("SequenceNum"); + sequence_num->mutable_data(ctx.GetPlace()); auto normalized = ctx.Attr("normalized"); auto stream = reinterpret_cast( @@ -88,7 +91,11 @@ class EditDistanceGPUKernel : public framework::OpKernel { "Reference string %d is empty.", i); } - auto num_strs = hyp_lod.size() - 1; + const size_t num_strs = hyp_lod.size() - 1; + math::SetConstant set_constant; + set_constant(ctx.template device_context(), + sequence_num, static_cast(num_strs)); + out_t->Resize({static_cast(num_strs), 1}); out_t->mutable_data(ctx.GetPlace()); auto out = out_t->data(); diff --git a/paddle/operators/edit_distance_op.h b/paddle/operators/edit_distance_op.h index 4c5a29813c..974299e604 100644 --- a/paddle/operators/edit_distance_op.h +++ b/paddle/operators/edit_distance_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" - namespace paddle { namespace operators { @@ -28,6 +27,8 @@ class EditDistanceKernel : public framework::OpKernel { auto* x1_t = ctx.Input("Hyps"); auto* x2_t = ctx.Input("Refs"); + auto* sequence_num = ctx.Output("SequenceNum"); + int64_t* seq_num_data = sequence_num->mutable_data(ctx.GetPlace()); auto normalized = ctx.Attr("normalized"); @@ -41,6 +42,7 @@ class EditDistanceKernel : public framework::OpKernel { "Reference string %d is empty.", i); } auto num_strs = hyp_lod.size() - 1; + *seq_num_data = static_cast(num_strs); out_t->Resize({static_cast(num_strs), 1}); out_t->mutable_data(ctx.GetPlace()); diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py index 351db4f12d..67e99a70ad 100644 --- a/python/paddle/v2/fluid/evaluator.py +++ b/python/paddle/v2/fluid/evaluator.py @@ -219,15 +219,14 @@ class EditDistance(Evaluator): self.total_error = self.create_state( dtype='float32', shape=[1], suffix='total') - self.batch_num = self.create_state( - dtype='float32', shape=[1], suffix='total') - error = layers.edit_distance(input=input, label=label) - error = layers.cast(x=error, dtype='float32') - mean_error = layers.mean(x=error) - layers.sums(input=[self.total_error, mean_error], out=self.total_error) - const1 = layers.fill_constant(shape=[1], value=1.0, dtype="float32") - layers.sums(input=[self.batch_num, const1], out=self.batch_num) - self.metrics.append(mean_error) + self.seq_num = self.create_state( + dtype='int64', shape=[1], suffix='total') + error, seq_num = layers.edit_distance(input=input, label=label) + #error = layers.cast(x=error, dtype='float32') + sum_error = layers.reduce_sum(error) + layers.sums(input=[self.total_error, sum_error], out=self.total_error) + layers.sums(input=[self.seq_num, seq_num], out=self.seq_num) + self.metrics.append(sum_error) def eval(self, executor, eval_program=None): if eval_program is None: @@ -235,6 +234,7 @@ class EditDistance(Evaluator): block = eval_program.current_block() with program_guard(main_program=eval_program): total_error = _clone_var_(block, self.total_error) - batch_num = _clone_var_(block, self.batch_num) - out = layers.elementwise_div(x=total_error, y=batch_num) + seq_num = _clone_var_(block, self.seq_num) + seq_num = layers.cast(x=seq_num, 
dtype='float32') + out = layers.elementwise_div(x=total_error, y=seq_num) return np.array(executor.run(eval_program, fetch_list=[out])[0]) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index c57811df1d..9a1fc2f120 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -1918,14 +1918,16 @@ def edit_distance(input, label, normalized=False, tokens=None, name=None): # edit distance op edit_distance_out = helper.create_tmp_variable(dtype="int64") + sequence_num = helper.create_tmp_variable(dtype="int64") helper.append_op( type="edit_distance", inputs={"Hyps": [input], "Refs": [label]}, - outputs={"Out": [edit_distance_out]}, + outputs={"Out": [edit_distance_out], + "SequenceNum": [sequence_num]}, attrs={"normalized": normalized}) - return edit_distance_out + return edit_distance_out, sequence_num def ctc_greedy_decoder(input, blank, name=None): diff --git a/python/paddle/v2/fluid/tests/test_edit_distance_op.py b/python/paddle/v2/fluid/tests/test_edit_distance_op.py index 5f5634e297..01e7e64d05 100644 --- a/python/paddle/v2/fluid/tests/test_edit_distance_op.py +++ b/python/paddle/v2/fluid/tests/test_edit_distance_op.py @@ -60,6 +60,7 @@ class TestEditDistanceOp(OpTest): num_strs = len(x1_lod) - 1 distance = np.zeros((num_strs, 1)).astype("float32") + sequence_num = np.array(2).astype("int64") for i in range(0, num_strs): distance[i] = Levenshtein( hyp=x1[x1_lod[i]:x1_lod[i + 1]], @@ -69,7 +70,7 @@ class TestEditDistanceOp(OpTest): distance[i] = distance[i] / len_ref self.attrs = {'normalized': normalized} self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])} - self.outputs = {'Out': distance} + self.outputs = {'Out': distance, 'SequenceNum': sequence_num} def test_check_output(self): self.check_output() @@ -88,6 +89,7 @@ class TestEditDistanceOpNormalized(OpTest): num_strs = len(x1_lod) - 1 distance = np.zeros((num_strs, 1)).astype("float32") + sequence_num = np.array(3).astype("int64") for i in range(0, num_strs): distance[i] = Levenshtein( hyp=x1[x1_lod[i]:x1_lod[i + 1]], @@ -97,7 +99,7 @@ class TestEditDistanceOpNormalized(OpTest): distance[i] = distance[i] / len_ref self.attrs = {'normalized': normalized} self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])} - self.outputs = {'Out': distance} + self.outputs = {'Out': distance, 'SequenceNum': sequence_num} def test_check_output(self): self.check_output() From 8143a42667d3dd158a464449e3492b7b0acf55c7 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 22 Jan 2018 17:34:45 +0800 Subject: [PATCH 42/54] 1. Add more comments --- python/paddle/v2/fluid/evaluator.py | 36 +++++++++++++++++++++++++---- python/paddle/v2/fluid/layers/nn.py | 16 ++++++++----- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py index 67e99a70ad..5dde8d623a 100644 --- a/python/paddle/v2/fluid/evaluator.py +++ b/python/paddle/v2/fluid/evaluator.py @@ -208,20 +208,46 @@ class ChunkEvaluator(Evaluator): class EditDistance(Evaluator): """ - Average edit distance error for multiple mini-batches. + Accumulate edit distance sum and sequence number from mini-batches and + compute the average edit_distance of all batches. + + Args: + input: the sequences predicted by network + label: the target sequences which must has same sequence count + with input. + ignored_tokens(list of int): Tokens that should be removed before + calculating edit distance. 
+ + Example: + + exe = fluid.executor(place) + distance_evaluator = fluid.Evaluator.EditDistance(input, label) + for epoch in PASS_NUM: + distance_evaluator.reset(exe) + for data in batches: + loss, sum_distance = exe.run(fetch_list=[cost] + distance_evaluator.metrics) + avg_distance = distance_evaluator.eval(exe) + pass_distance = distance_evaluator.eval(exe) + + In the above example: + 'sum_distance' is the sum of the batch's edit distance. + 'avg_distance' is the average of edit distance from the firt batch to the current batch. + 'pass_distance' is the average of edit distance from all the pass. + """ - def __init__(self, input, label, k=1, **kwargs): + def __init__(self, input, label, ignored_tokens=None, **kwargs): super(EditDistance, self).__init__("edit_distance", **kwargs) main_program = self.helper.main_program if main_program.current_block().idx != 0: raise ValueError("You can only invoke Evaluator in root block") self.total_error = self.create_state( - dtype='float32', shape=[1], suffix='total') + dtype='float32', shape=[1], suffix='total_error') self.seq_num = self.create_state( - dtype='int64', shape=[1], suffix='total') - error, seq_num = layers.edit_distance(input=input, label=label) + dtype='int64', shape=[1], suffix='seq_num') + error, seq_num = layers.edit_distance( + input=input, label=label, ignored_tokens=ignored_tokens) #error = layers.cast(x=error, dtype='float32') sum_error = layers.reduce_sum(error) layers.sums(input=[self.total_error, sum_error], out=self.total_error) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 9a1fc2f120..7dd77aca95 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -1864,7 +1864,11 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): return out -def edit_distance(input, label, normalized=False, tokens=None, name=None): +def edit_distance(input, + label, + normalized=False, + ignored_tokens=None, + name=None): """ EditDistance operator computes the edit distances between a batch of hypothesis strings and their references.Edit distance, also called Levenshtein distance, measures how dissimilar two strings are by counting the minimum number of operations to transform one string into anthor. Here the operations include insertion, deletion, and substitution. For example, given hypothesis string A = "kitten" and reference B = "sitting", the edit distance is 3 for A will be transformed into B at least after two substitutions and one insertion: @@ -1882,10 +1886,10 @@ def edit_distance(input, label, normalized=False, tokens=None, name=None): normalized(bool): Indicated whether to normalize the edit distance by the length of reference string. - tokens(list): Tokens that should be removed before calculating edit distance. + ignored_tokens(list of int): Tokens that should be removed before calculating edit distance. Returns: - Variable: sequence-to-sequence edit distance loss in shape [batch_size, 1]. + Variable: sequence-to-sequence edit distance in shape [batch_size, 1]. Examples: .. 
code-block:: python @@ -1898,7 +1902,7 @@ def edit_distance(input, label, normalized=False, tokens=None, name=None): helper = LayerHelper("edit_distance", **locals()) # remove some tokens from input and labels - if tokens is not None and len(tokens) > 0: + if ignored_tokens is not None and len(ignored_tokens) > 0: erased_input = helper.create_tmp_variable(dtype="int64") erased_label = helper.create_tmp_variable(dtype="int64") @@ -1906,14 +1910,14 @@ def edit_distance(input, label, normalized=False, tokens=None, name=None): type="sequence_erase", inputs={"X": [input]}, outputs={"Out": [erased_input]}, - attrs={"tokens": tokens}) + attrs={"tokens": ignored_tokens}) input = erased_input helper.append_op( type="sequence_erase", inputs={"X": [label]}, outputs={"Out": [erase_label]}, - attrs={"tokens": tokens}) + attrs={"tokens": ignored_tokens}) label = erased_label # edit distance op From 38f47e642f277a4eba73221cd480f9e5ceda9588 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Mon, 22 Jan 2018 17:59:24 +0800 Subject: [PATCH 43/54] Fix CI --- python/paddle/v2/fluid/layers/math_op_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py index 11197b70a3..f359e70126 100644 --- a/python/paddle/v2/fluid/layers/math_op_patch.py +++ b/python/paddle/v2/fluid/layers/math_op_patch.py @@ -13,7 +13,7 @@ # limitations under the License. from ..framework import Variable, unique_name -from ..registry import OpProtoHolder +from layer_function_generator import OpProtoHolder __all__ = ['monkey_patch_variable'] From 07908686d5bf63d01a313bb65d2e2c081e65db8d Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 22 Jan 2018 18:52:06 +0800 Subject: [PATCH 44/54] Update some comments and add more check. --- paddle/operators/bipartite_match_op.cc | 72 +++++++++++++++----------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc index 8dbade65a5..c2d30c7d92 100644 --- a/paddle/operators/bipartite_match_op.cc +++ b/paddle/operators/bipartite_match_op.cc @@ -21,6 +21,8 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; +constexpr char kEPS = 1e-6; + class BipartiteMatchOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -41,12 +43,13 @@ template class BipartiteMatchKernel : public framework::OpKernel { public: // The match_indices must be initialized to -1 at first. - // The match_dis must be initialized to 0 at first. - void BipartiteMatch(const Tensor& dis, int* match_indices, - T* match_dis) const { - int64_t row = dis.dims()[0]; - int64_t col = dis.dims()[1]; - auto* dis_data = dis.data(); + // The match_dist must be initialized to 0 at first. 
+ void BipartiteMatch(const Tensor& dist, int* match_indices, + T* match_dist) const { + PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2."); + int64_t row = dist.dims()[0]; + int64_t col = dist.dims()[1]; + auto* dist_data = dist.data(); std::vector row_pool; for (int i = 0; i < row; ++i) { row_pool.push_back(i); @@ -54,7 +57,7 @@ class BipartiteMatchKernel : public framework::OpKernel { while (row_pool.size() > 0) { int max_idx = -1; int max_row_idx = -1; - T max_dis = -1; + T max_dist = -1; for (int64_t j = 0; j < col; ++j) { if (match_indices[j] != -1) { continue; @@ -62,13 +65,13 @@ class BipartiteMatchKernel : public framework::OpKernel { for (int k = 0; k < row_pool.size(); ++k) { int m = row_pool[k]; // distance is 0 between m-th row and j-th column - if (dis_data[m * col + j] < 1e-6) { + if (dist_data[m * col + j] < kEPS) { continue; } - if (dis_data[m * col + j] > max_dis) { + if (dist_data[m * col + j] > max_dist) { max_idx = j; max_row_idx = m; - max_dis = dis_data[m * col + j]; + max_dist = dist_data[m * col + j]; } } } @@ -78,7 +81,7 @@ class BipartiteMatchKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); match_indices[max_idx] = max_row_idx; - match_dis[max_idx] = max_dis; + match_dist[max_idx] = max_dist; // Erase the row index. row_pool.erase( std::find(row_pool.begin(), row_pool.end(), max_row_idx)); @@ -87,34 +90,38 @@ class BipartiteMatchKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext& context) const override { - auto* dis_mat = context.Input("DisMat"); + auto* dist_mat = context.Input("DisMat"); auto* match_indices = context.Output("ColToRowMatchIndices"); - auto* match_dis = context.Output("ColToRowMatchDis"); + auto* match_dist = context.Output("ColToRowMatchDis"); auto& dev_ctx = context.device_context(); - auto col = dis_mat->dims()[1]; + auto col = dist_mat->dims()[1]; - int64_t n = dis_mat->lod().size() == 0 + int64_t n = dist_mat->lod().size() == 0UL ? 1 - : static_cast(dis_mat->lod().back().size() - 1); + : static_cast(dist_mat->lod().back().size() - 1); + if (dist_mat->lod().size()) { + PADDLE_ENFORCE_EQ(dist_mat->lod().size(), 1UL, + "Only support 1 level of LoD."); + } match_indices->mutable_data({n, col}, context.GetPlace()); - match_dis->mutable_data({n, col}, context.GetPlace()); + match_dist->mutable_data({n, col}, context.GetPlace()); math::SetConstant iset; iset(dev_ctx, match_indices, static_cast(-1)); math::SetConstant tset; - tset(dev_ctx, match_dis, static_cast(0)); + tset(dev_ctx, match_dist, static_cast(0)); int* indices = match_indices->data(); - T* dis = match_dis->data(); + T* dist = match_dist->data(); if (n == 1) { - BipartiteMatch(*dis_mat, indices, dis); + BipartiteMatch(*dist_mat, indices, dist); } else { - auto lod = dis_mat->lod().back(); + auto lod = dist_mat->lod().back(); for (size_t i = 0; i < lod.size() - 1; ++i) { - Tensor one_ins = dis_mat->Slice(lod[i], lod[i + 1]); - BipartiteMatch(one_ins, indices + i * col, dis + i * col); + Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); + BipartiteMatch(one_ins, indices + i * col, dist + i * col); } } } @@ -131,7 +138,7 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { "represented by each row and each column. For example, assumed one " "entity is A with shape [K], another entity is B with shape [M]. The " "DisMat[i][j] is the distance between A[i] and B[j]. The bigger " - "the distance is, the more similar the pairs are. 
Please note, " + "the distance is, the better macthing the pairs are. Please note, " "This tensor can contain LoD information to represent a batch of " "inputs. One instance of this batch can contain different numbers of " "entities."); @@ -140,20 +147,25 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it " "means B[j] does not match any entity in i-th instance. " "Otherwise, it means B[j] is matched to row " - "RowToColMatchIndices[i][j] in i-th instance. The row number of " - "i-th instance is saved in RowToColMatchIndices[i][j]."); + "ColToRowMatchIndices[i][j] in i-th instance. The row number of " + "i-th instance is saved in ColToRowMatchIndices[i][j]."); AddOutput("ColToRowMatchDis", "(Tensor) A 2-D Tensor with shape [N, M] in float type. " "N is batch size. If ColToRowMatchIndices[i][j] is -1, " "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed " - "RowToColMatchIndices[i][j] = d, and the row offsets of each " + "ColToRowMatchIndices[i][j] = d, and the row offsets of each " "instance are called LoD. Then " "ColToRowMatchDis[i][j] = DisMat[d+LoD[i]][j]"); AddComment(R"DOC( This operator is a greedy bipartite matching algorithm, which is used to -obtain the matching with the (greedy) maximum distance based on the input -distance matrix. There are two outputs to save matched indices and distance. -And this operator only calculate matched indices from column to row. +obtain the matching with the maximum distance based on the input +distance matrix. For input 2D matrix, the bipartite matching algorithm can +find the matched column for each row, also can find the matched row for +each column. And this operator only calculate matched indices from column +to row. For each instance, the number of matched indices is the number of +of columns of the input ditance matrix. + +There are two outputs to save matched indices and distance. A simple description, this algothrim matched the best (maximum distance) row entity to the column entity and the matched indices are not duplicated in each row of ColToRowMatchIndices. If the column entity is not matched From d9d9be1bac627d5314accdf89a4367bc3a2f0294 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 22 Jan 2018 19:14:47 +0800 Subject: [PATCH 45/54] Fix white space in comments. --- python/paddle/v2/fluid/evaluator.py | 2 +- python/paddle/v2/fluid/layers/nn.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py index 5dde8d623a..933f91dcfe 100644 --- a/python/paddle/v2/fluid/evaluator.py +++ b/python/paddle/v2/fluid/evaluator.py @@ -212,7 +212,7 @@ class EditDistance(Evaluator): compute the average edit_distance of all batches. Args: - input: the sequences predicted by network + input: the sequences predicted by network. label: the target sequences which must has same sequence count with input. 
ignored_tokens(list of int): Tokens that should be removed before diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 7dd77aca95..5b53f5d64e 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -1870,7 +1870,7 @@ def edit_distance(input, ignored_tokens=None, name=None): """ - EditDistance operator computes the edit distances between a batch of hypothesis strings and their references.Edit distance, also called Levenshtein distance, measures how dissimilar two strings are by counting the minimum number of operations to transform one string into anthor. Here the operations include insertion, deletion, and substitution. For example, given hypothesis string A = "kitten" and reference B = "sitting", the edit distance is 3 for A will be transformed into B at least after two substitutions and one insertion: + EditDistance operator computes the edit distances between a batch of hypothesis strings and their references. Edit distance, also called Levenshtein distance, measures how dissimilar two strings are by counting the minimum number of operations to transform one string into anthor. Here the operations include insertion, deletion, and substitution. For example, given hypothesis string A = "kitten" and reference B = "sitting", the edit distance is 3 for A will be transformed into B at least after two substitutions and one insertion: "kitten" -> "sitten" -> "sittin" -> "sitting" @@ -2028,7 +2028,7 @@ def warpctc(input, label, blank=0, norm_by_times=False, **kwargs): Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). norm_by_times: (bool, default: false), whether to normalize - the gradients by the number of time-step,which is also the + the gradients by the number of time-step, which is also the sequence's length. There is no need to normalize the gradients if warpctc layer was follewed by a mean_op. From c9e208c84593362656663f5e59f787b77ff44875 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 22 Jan 2018 19:19:06 +0800 Subject: [PATCH 46/54] Fix white space in comments. --- paddle/operators/im2sequence_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/operators/im2sequence_op.cc b/paddle/operators/im2sequence_op.cc index 1854fc384c..31baaedf69 100644 --- a/paddle/operators/im2sequence_op.cc +++ b/paddle/operators/im2sequence_op.cc @@ -56,12 +56,12 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { Im2SequenceOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", - "(Tensor)The input tensor has NCHW format." + "(Tensor) The input tensor has NCHW format." "N: batch size" "C: channels" "H: height" "W: width"); - AddOutput("Out", "(LodTensor)The output data of im2sequence op,"); + AddOutput("Out", "(LodTensor) The output data of im2sequence op,"); AddAttr>("kernels", "(vector), the " "kernels(kernel_height, kernel_width)"); From 58cd4fda625121af4b24f5e988c16a6b801a75ef Mon Sep 17 00:00:00 2001 From: ying Date: Mon, 22 Jan 2018 18:22:50 +0800 Subject: [PATCH 47/54] add wrapper for transpose operator. 
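The new wrapper follows the same permutation convention as `numpy.transpose`: output dimension `i` takes its size from input dimension `perm[i]`. A NumPy-only sketch of the intended shape change (an illustration of the semantics described in the docstring below, not code from this patch):

```python
import numpy as np

# the docstring example below permutes a [5, 10, 15] tensor with perm=[1, 0, 2]
x = np.random.rand(5, 10, 15).astype("float32")
y = np.transpose(x, axes=(1, 0, 2))  # output dim i comes from input dim perm[i]
assert y.shape == (10, 5, 15)
```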
--- python/paddle/v2/fluid/layers/nn.py | 99 +++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 18 deletions(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index b1db16a83e..a6ff213935 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -22,13 +22,38 @@ from ..param_attr import ParamAttr from tensor import concat __all__ = [ - 'fc', 'embedding', 'dynamic_lstm', 'gru_unit', 'linear_chain_crf', - 'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy', - 'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d', - 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand', - 'lstm_unit', 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', - 'sequence_first_step', 'sequence_last_step', 'dropout', 'split', - 'l2_normalize', 'matmul', 'warpctc', 'sequence_reshape' + 'fc', + 'embedding', + 'dynamic_lstm', + 'gru_unit', + 'linear_chain_crf', + 'crf_decoding', + 'cos_sim', + 'cross_entropy', + 'square_error_cost', + 'accuracy', + 'chunk_eval', + 'sequence_conv', + 'conv2d', + 'sequence_pool', + 'pool2d', + 'batch_norm', + 'beam_search_decode', + 'conv2d_transpose', + 'sequence_expand', + 'lstm_unit', + 'reduce_sum', + 'reduce_mean', + 'reduce_max', + 'reduce_min', + 'sequence_first_step', + 'sequence_last_step', + 'dropout', + 'split', + 'l2_normalize', + 'matmul', + 'warpctc', + 'sequence_reshape', ] @@ -43,14 +68,14 @@ def fc(input, **Fully Connected Layer** The fully connected layer can take multiple tensors as its inputs. It - creates a variable (one for each input tensor) called weights for each input - tensor, which represents a fully connected weight matrix from each input - unit to each output unit. The fully connected layer multiplies each input - tensor with its coresponding weight to produce an output Tensor. If - multiple input tensors are given, the results of multiple multiplications - will be sumed up. If bias_attr is not None, a biases variable will be - created and added to the output. Finally, if activation is not None, - it will be applied to the output as well. + creates a variable (one for each input tensor) called weights for each + input tensor, which represents a fully connected weight matrix from + each input unit to each output unit. The fully connected layer + multiplies each input tensor with its coresponding weight to produce + an output Tensor. If multiple input tensors are given, the results of + multiple multiplications will be sumed up. If bias_attr is not None, + a biases variable will be created and added to the output. Finally, + if activation is not None, it will be applied to the output as well. This process can be formulated as follows: @@ -1813,11 +1838,11 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): - If both are 2-D, they are multiplied like conventional matrices. - If either is n-D, it is treated as a stack of matrices residing in the - last two dimensions and a batched matrix multiply supporting broadcast + last two dimensions and a batched matrix multiply supporting broadcast applies on the two tensors. - Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and - nontransposed, the prepended or appended dimension :math:`1` will be + Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and + nontransposed, the prepended or appended dimension :math:`1` will be removed after matrix multiplication. 
Args: @@ -1971,3 +1996,41 @@ def sequence_reshape(input, new_dim): outputs={'Out': [out]}, attrs={'new_dim': new_dim}) return out + + +def transpose(input, perm, name=None): + """ + **transpose Layer** + + Permute the dimensions of `input` according to `perm`. + + The `i`-th dimension of the returned tensor will correspond to the + perm[i]-th dimension of `input`. + + Args: + input (Variable): (Tensor), A Tensor. + perm (list): A permutation of the dimensions of `input`. + + Returns: + Variable: A transposed Tensor. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32') + x_transposed = layers.transpose(input=x, perm=[1, 0, 2]) + """ + + if len(perm) != len(input.shape): + raise ValueError( + "Input(perm) is the permutation of dimensions of Input(input). " + "It's length shoud be equal to Input(input)'s rank.") + + helper = LayerHelper('transpose', **locals()) + out = helper.create_tmp_variable(helper.input_dtype()) + helper.append_op( + type='transpose', + inputs={'X': [input]}, + outputs={'Out': [out]}, + attrs={'axis': perm}) + return out From 530df1b2891413839989e2c6568f31fd9e1a1f98 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 22 Jan 2018 21:28:10 +0800 Subject: [PATCH 48/54] Fix the naming. --- paddle/operators/bipartite_match_op.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc index c2d30c7d92..0fcff6e26d 100644 --- a/paddle/operators/bipartite_match_op.cc +++ b/paddle/operators/bipartite_match_op.cc @@ -28,11 +28,11 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("DisMat"), - "Input(DisMat) of BipartiteMatch should not be null."); + PADDLE_ENFORCE(ctx->HasInput("DistMat"), + "Input(DistMat) of BipartiteMatch should not be null."); - auto dims = ctx->GetInputDim("DisMat"); - PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DisMat) must be 2."); + auto dims = ctx->GetInputDim("DistMat"); + PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2."); ctx->SetOutputDim("ColToRowMatchIndices", dims); ctx->SetOutputDim("ColToRowMatchDis", dims); @@ -90,7 +90,7 @@ class BipartiteMatchKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext& context) const override { - auto* dist_mat = context.Input("DisMat"); + auto* dist_mat = context.Input("DistMat"); auto* match_indices = context.Output("ColToRowMatchIndices"); auto* match_dist = context.Output("ColToRowMatchDis"); @@ -132,12 +132,12 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { BipartiteMatchOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( - "DisMat", + "DistMat", "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " "[K, M]. It is pair-wise distance matrix between the entities " "represented by each row and each column. For example, assumed one " "entity is A with shape [K], another entity is B with shape [M]. The " - "DisMat[i][j] is the distance between A[i] and B[j]. The bigger " + "DistMat[i][j] is the distance between A[i] and B[j]. The bigger " "the distance is, the better macthing the pairs are. Please note, " "This tensor can contain LoD information to represent a batch of " "inputs. 
One instance of this batch can contain different numbers of " @@ -155,7 +155,7 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed " "ColToRowMatchIndices[i][j] = d, and the row offsets of each " "instance are called LoD. Then " - "ColToRowMatchDis[i][j] = DisMat[d+LoD[i]][j]"); + "ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]"); AddComment(R"DOC( This operator is a greedy bipartite matching algorithm, which is used to obtain the matching with the maximum distance based on the input @@ -171,7 +171,7 @@ row entity to the column entity and the matched indices are not duplicated in each row of ColToRowMatchIndices. If the column entity is not matched any row entity, set -1 in ColToRowMatchIndices. -Please note that the input DisMat can be LoDTensor (with LoD) or Tensor. +Please note that the input DistMat can be LoDTensor (with LoD) or Tensor. If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. If Tensor, the height of ColToRowMatchIndices is 1. From e44dedf90f6b0563d6a36f58f10448e29d2ac552 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 22 Jan 2018 23:54:25 +0800 Subject: [PATCH 49/54] Fix the warning and unit test. --- paddle/operators/bipartite_match_op.cc | 2 +- python/paddle/v2/fluid/tests/test_bipartite_match_op.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc index 0fcff6e26d..b0f7376d27 100644 --- a/paddle/operators/bipartite_match_op.cc +++ b/paddle/operators/bipartite_match_op.cc @@ -62,7 +62,7 @@ class BipartiteMatchKernel : public framework::OpKernel { if (match_indices[j] != -1) { continue; } - for (int k = 0; k < row_pool.size(); ++k) { + for (size_t k = 0; k < row_pool.size(); ++k) { int m = row_pool[k]; // distance is 0 between m-th row and j-th column if (dist_data[m * col + j] < kEPS) { diff --git a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py index 8f1db35d3c..34101b1da4 100644 --- a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py +++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py @@ -69,7 +69,7 @@ class TestBipartiteMatchOpForWithLoD(OpTest): dis = np.random.random((23, 217)).astype('float32') match_indices, match_dis = batch_bipartite_match(dis, lod[0]) - self.inputs = {'DisMat': (dis, lod)} + self.inputs = {'DistMat': (dis, lod)} self.outputs = { 'ColToRowMatchIndices': (match_indices), 'ColToRowMatchDis': (match_dis), @@ -86,7 +86,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): dis = np.random.random((8, 17)).astype('float32') match_indices, match_dis = batch_bipartite_match(dis, lod[0]) - self.inputs = {'DisMat': dis} + self.inputs = {'DistMat': dis} self.outputs = { 'ColToRowMatchIndices': (match_indices), 'ColToRowMatchDis': (match_dis), From 6ae46a29c21375503af1ff1331b83b4d37d505c4 Mon Sep 17 00:00:00 2001 From: kavyasrinet Date: Mon, 22 Jan 2018 12:12:39 -0800 Subject: [PATCH 50/54] Updating the cluster trainign doc (#7746) --- .../usage/cluster/fluid_cluster_train_en.md | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/usage/cluster/fluid_cluster_train_en.md index a64004a7c4..11904a6f71 100644 --- a/doc/howto/usage/cluster/fluid_cluster_train_en.md +++ b/doc/howto/usage/cluster/fluid_cluster_train_en.md @@ -2,27 +2,27 @@ ## Introduction -In this article, we'll 
explain how to config and run distributed training jobs with PaddlePaddle Fluid in a bare metal cluster. +In this article, we'll explain how to configure and run distributed training jobs with PaddlePaddle Fluid in a bare metal cluster. ## Preparations -### Get your cluster ready +### Getting the cluster ready -Prepare your computer nodes in the cluster. Nodes in this cluster can be of any specification that runs PaddlePaddle, and with a unique IP address assigned to it. Make sure they can communicate with each other. +Prepare the compute nodes in the cluster. Nodes in this cluster can be of any specification that runs PaddlePaddle, and with a unique IP address assigned to it. Make sure they can communicate to each other. ### Have PaddlePaddle installed PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes, be sure to properly install drivers and CUDA libraries. -PaddlePaddle build and installation guide can be found from [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html). +PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html). -### Update training script +### Update the training script #### Non-cluster training script Let's take [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)'s first chapter: "fit a line" as an example. -This demo's non-cluster version with fluid API is as follows: +The non-cluster version of this demo with fluid API is as follows: ``` python import paddle.v2 as paddle @@ -65,25 +65,25 @@ for pass_id in range(PASS_NUM): exit(1) ``` -We created a simple fully connected neural networks training program and handed it to the fluid executor to run for 100 passes. +We created a simple fully-connected neural network training program and handed it to the fluid executor to run for 100 passes. -Now let's try to convert it to a distributed version to run in a cluster. +Now let's try to convert it to a distributed version to run on a cluster. #### Introducing parameter server -As you see from the non-cluster version of training script, there is only one role in it: the trainer, who does the computing as well as holding parameters. In cluster training, since multi-trainers are working on the same task, they need one centralized place to hold and distribute parameters. This centralized place is called the Parameter Server in PaddlePaddle. +As we can see from the non-cluster version of training script, there is only one role in the script: the trainer, that performs the computing as well as holds the parameters. In cluster training, since multi-trainers are working on the same task, they need one centralized place to hold and distribute parameters. This centralized place is called the Parameter Server in PaddlePaddle. -![parameter server architect](src/trainer.png) +![parameter server architecture](src/trainer.png) -Parameter Server in fluid does not only hold parameters but is also assigned with a part of the program. Trainers communicate with parameter servers via send/receive OPs. For more tech detail, please refer to this [document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/dist_refactor/distributed_architecture.md). +Parameter Server in fluid not only holds the parameters but is also assigned with a part of the program. Trainers communicate with parameter servers via send/receive OPs. 
For more technical details, please refer to [this document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/dist_refactor/distributed_architecture.md). -Now we need to create program for both trainers and parameter servers, the question is how? +Now we need to create programs for both: trainers and parameter servers, the question is how? #### Slice the program -Fluid provides a tool called "Distribute Transpiler" to automatically convert the non-cluster program into cluster program. +Fluid provides a tool called "Distributed Transpiler" that automatically converts the non-cluster program into cluster program. -The idea behind this tool is to find optimize OPs and gradient parameters, slice the program into 2 pieces and connect them with send/receive OP. +The idea behind this tool is to find the optimize OPs and gradient parameters, slice the program into 2 pieces and connect them with send/receive OP. Optimize OPs and gradient parameters can be found from the return values of optimizer's minimize function. @@ -94,9 +94,9 @@ To put them together: optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters -t = fluid.DistributeTranspiler() # create transpiler instance +t = fluid.DistributeTranspiler() # create the transpiler instance # slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers -t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) ... #create executor @@ -119,7 +119,7 @@ for pass_id in range(100): ### E2E demo -Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). In parameter server node run this in the command line: +Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). In parameter server node run the following in the command line: ``` bash PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py @@ -129,12 +129,12 @@ PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER Wait until the prompt `Server listening on 192.168.1.2:6174` -Then in 2 of your trainer node run this: +Then in 2 of your trainer nodes run this: ``` bash PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py ``` -*the reason you need to run this command twice in 2 nodes is: in the script we set the trainer count to be 2. You can change this setting on line 50* +*the reason you need to run this command twice in 2 nodes is because: in the script we set the trainer count to be 2. You can change this setting on line 50* Now you have 2 trainers and 1 parameter server up and running. From b7eeef2489933ba697fd368a6a9c41a361d01bbc Mon Sep 17 00:00:00 2001 From: kavyasrinet Date: Mon, 22 Jan 2018 14:33:41 -0800 Subject: [PATCH 51/54] Updating the comments for send_op and recv_op. 
(#7747) * Updating the cluster trainign doc * Fixed comments * Updating few comments in recv_op --- paddle/operators/recv_op.cc | 16 ++++++++-------- paddle/operators/send_op.cc | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index 6036080735..593c35879a 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -49,7 +49,7 @@ static void CreateTensorFromMessageType(framework::Variable *var, var->GetMutable(); } else { PADDLE_THROW( - "VraibleMessage type %d is not in " + "VariableMessage type %d is not in " "[LoDTensor, SelectedRows]", var_type); } @@ -121,17 +121,17 @@ class RecvOp : public framework::OperatorBase { if (it != grad_list.end()) { param_var_name = param_list[it - grad_list.begin()]; } else { - LOG(ERROR) << "grad have no paired param:" << grad_var_name; + LOG(ERROR) << "grad has no paired param:" << grad_var_name; } - VLOG(3) << "recved grad: " << grad_var_name + VLOG(3) << "received grad: " << grad_var_name << " updating param: " << param_var_name; if (fan_in > 1) { grad_var_name = this->GetGradVarNameForTrainer(grad_var_name); } auto *var = recv_scope.FindVar(grad_var_name); if (var == nullptr) { - LOG(ERROR) << "can not find server side var: " << grad_var_name; - PADDLE_THROW("can not find server side var"); + LOG(ERROR) << "Can not find server side var: " << grad_var_name; + PADDLE_THROW("Can not find server side var"); } detail::DeserializeFromMessage(v.second, dev_ctx, var); } @@ -165,7 +165,7 @@ class RecvOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Recv operator -This operator will recv tensor from send_op +This operator will recieve tensor from send_op )DOC"); AddAttr("endpoint", "(string, default 127.0.0.1:6164)" @@ -176,11 +176,11 @@ This operator will recv tensor from send_op kOptimizeBlock, "Serialized ProgramDesc string for recv to run."); AddAttr>( "ParamList", "type list of string", - "grad->param name mapping to find which param to optimize.") + "grad->param name mapping to find which parameters to optimize.") .SetDefault({}); AddAttr>( "GradList", "type list of string", - "grad->param name mapping to find which param to optimize.") + "grad->param name mapping to find which parameters to optimize.") .SetDefault({}); AddAttr("Fanin", "type int", "Number of trainers in the current cluster job") diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc index 807533a6c6..5aa66c20ea 100644 --- a/paddle/operators/send_op.cc +++ b/paddle/operators/send_op.cc @@ -62,13 +62,13 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { public: SendOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "(Tensor) Input tensor to be send").AsDuplicable(); - AddOutput("Out", "(Tensor) Output tensor to get from server") + AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); + AddOutput("Out", "(Tensor) Output tensor to be received from server") .AsDuplicable(); AddComment(R"DOC( Send operator -This operator will send tensor to recv_op. +This operator will send tensor to recv_op at the parameter server. 
)DOC"); AddAttr>("endpoints", "(string vector, default 127.0.0.1:6164)" From 23f5c1829ce0e2094fe4a9fa6d90c63e56181086 Mon Sep 17 00:00:00 2001 From: kavyasrinet Date: Mon, 22 Jan 2018 16:56:09 -0800 Subject: [PATCH 52/54] Fixed few comments in transpiler (#7748) * Updating the cluster trainign doc * Fixed few comments of transpiler * Adding few explanations --- .../paddle/v2/fluid/distribute_transpiler.py | 49 +++++++++++-------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py index 573774a232..abcad899bf 100644 --- a/python/paddle/v2/fluid/distribute_transpiler.py +++ b/python/paddle/v2/fluid/distribute_transpiler.py @@ -38,14 +38,14 @@ def split_dense_variable(var_list, min_block_size=1024, max_block_size=1048576): """ - We may need to split dense tensor to one or several blocks and put + We may need to split dense tensor to one or more blocks and put them equally onto parameter server. One block is a sub-tensor aligned by dim[0] of the tensor. - + We need to have a minimal block size so that the calculations in the parameter server side can gain better performance. By default - mininum block size is 1024. The max block size is used to prevent - too large block that may causing send error. + minimum block size is 1024. The max block size is used to prevent + very large blocks that may cause send error. """ blocks = [] for var in var_list: @@ -64,7 +64,7 @@ def split_dense_variable(var_list, remains = block_size % dim1 if remains != 0: block_size += dim1 - remains - # update split_count after align + # update split_count after aligning split_count = int(math.ceil(var_numel / float(block_size))) for block_id in xrange(split_count): curr_block_size = min(block_size, var_numel - ( @@ -83,18 +83,18 @@ class DistributeTranspiler: trainers=1, split_method=round_robin): """ - Transpile the program to a distributed data-parallelism programs. - The main_program will be transform to use a remote parameter server + Transpile the program to distributed data-parallelism programs. + The main_program will be transformed to use a remote parameter server to do parameter optimization. And the optimization graph will be put - in to a parameter server program. + into a parameter server program. - Use different methods to split trainable varialbles to different + Use different methods to split trainable variables to different parameter servers. :param optimize_ops: op list of optimization, should be the return value of Optimizer.minimize :type optimize_ops: list - :param program: program to optimize, default default_main_program + :param program: program to optimize, default is default_main_program :param pservers: parameter server endpoints like "m1:6174,m2:6174" :type pservers: string :return: return a list of programs @@ -106,11 +106,11 @@ class DistributeTranspiler: self.trainers = trainers self.optimize_ops = optimize_ops # steps to transpile: - # 1. split variable to multiple blocks, align by product(dim[1:]) (width). + # 1. split variable to multiple blocks, aligned by product(dim[1:]) (width). # 2. modify trainer program add split_op to each Grad. # 3. append send_op to trainer. # 4. append concat_op to trainer to update local weights. - # 5. create new program as parameter server. + # 5. create new program for parameter server. # 6. 
create parameter server program by split_method generated endpoint->VarBlock pserver_endpoints = pservers.split(",") @@ -136,10 +136,10 @@ class DistributeTranspiler: for b in param_blocks: varname, block_id, _ = b.split(":") send_outputs.append(param_var_mapping[varname][int(block_id)]) - # let send_op know which endpoint to send which var, eplist is of the same - # order of send_inputs. + # let send_op know which endpoint to send which var to, eplist has the same + # order as send_inputs. eplist = split_method(send_inputs, pserver_endpoints) - # create mapping of endpoint -> splited var to create pserver side program + # create mapping of endpoint -> split var to create pserver side program self.param_grad_ep_mapping = dict() for i, ep in enumerate(eplist): param = send_outputs[i] @@ -149,6 +149,7 @@ class DistributeTranspiler: self.param_grad_ep_mapping[ep]["params"].append(param) self.param_grad_ep_mapping[ep]["grads"].append(grad) + # create send_op send_op = program.global_block().append_op( type="send", inputs={"X": send_inputs}, @@ -167,6 +168,7 @@ class DistributeTranspiler: attrs={"axis": 0}) def _create_vars_from_blocklist(self, program, block_list): + # Create respective variables using the block_list block_map = dict() var_mapping = dict() for block_str in block_list: @@ -207,11 +209,12 @@ class DistributeTranspiler: dtype=var.dtype, type=var.type, lod_level=var.lod_level, - # HACK: let all param in pserver persistable so child + # HACK: let all param in pserver be persistable so the child # program in recv can get them persistable=True) def _append_split_op(self, program, gradblocks): + # Split variables that need to be split and append respective ops var_mapping = self._create_vars_from_blocklist(program, gradblocks) for varname, splited_vars in var_mapping.iteritems(): # variable that don't need to split have empty splited_vars @@ -248,6 +251,7 @@ class DistributeTranspiler: return self.program def _create_var_for_trainers(self, block, var, trainers): + # For each trainer, create the necessary variables var_list = [] for i in xrange(trainers): var_each = block.create_var( @@ -262,7 +266,7 @@ class DistributeTranspiler: param_shape): """ Returns the shape for optimizer inputs that need to be reshaped when - Param and Grad is splited to multiple servers. + Param and Grad is split to multiple servers. """ # HACK(typhoonzero): Should use functions of corresponding optimizer in # optimizer.py to get the shape, do not bind this in the transpiler. 
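The block-splitting rule documented in `split_dense_variable` above can be illustrated with a short standalone sketch. The function name and the exact clamping of the block size are assumptions made for illustration; the transpiler itself additionally records each block as a `varname:block_id:...` string and creates the per-block variables:

```python
import math

def split_blocks(var_numel, dim1, pserver_count,
                 min_block_size=1024, max_block_size=1048576):
    # dim1 is the product of all dimensions except dim[0]; blocks stay
    # aligned to dim1 so every block holds whole rows of the parameter.
    block_size = int(math.ceil(var_numel / float(pserver_count)))
    block_size = max(min_block_size, min(block_size, max_block_size))
    remains = block_size % dim1
    if remains != 0:
        block_size += dim1 - remains  # align the block to whole rows
    split_count = int(math.ceil(var_numel / float(block_size)))
    return [min(block_size, var_numel - i * block_size)
            for i in range(split_count)]

# e.g. a 1000 x 64 parameter split across 4 parameter servers
print(split_blocks(1000 * 64, 64, 4))  # [16000, 16000, 16000, 16000]
```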
@@ -300,7 +304,7 @@ class DistributeTranspiler: else: for n in param_names: if n.startswith(op.inputs["Param"].name+".block") and \ - n != op.inputs["Param"].name: + n != op.inputs["Param"].name: return True return False else: @@ -396,7 +400,7 @@ class DistributeTranspiler: dtype=var.dtype, shape=new_shape) - # change outputs ParamOut variable + # change output's ParamOut variable opt_op.outputs["ParamOut"] = new_inputs["Param"] program.global_block().append_op( type=opt_op.type, @@ -405,6 +409,7 @@ class DistributeTranspiler: attrs=opt_op.attrs) def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op): + # Append the ops for parameters that do not need to be optimized/updated for _, var in opt_op.inputs.iteritems(): program.global_block().create_var( name=var.name, @@ -424,7 +429,7 @@ class DistributeTranspiler: def get_pserver_program(self, endpoint): """ - get pserver side program by endpoint + Get pserver side program using the endpoint NOTE: assume blocks of the same variable is not distributed on the same pserver, only change param/grad varnames for @@ -450,6 +455,7 @@ class DistributeTranspiler: shape=v.shape) # step6 optimize_sub_program = Program() + # Iterate through the ops and append ops as needed for idx, opt_op in enumerate(self.optimize_ops): is_op_on_pserver = self._is_op_on_pserver(endpoint, self.optimize_ops, idx) @@ -461,6 +467,7 @@ class DistributeTranspiler: else: self._append_pserver_non_opt_ops(optimize_sub_program, pserver_program, opt_op) + # Append the recv op pserver_program.global_block().append_op( type="recv", inputs={"RX": self.param_grad_ep_mapping[endpoint]["grads"] @@ -486,7 +493,7 @@ class DistributeTranspiler: """ Get startup program for current parameter server. Modify operator input variables if there are variables that - was splited to several blocks. + were split to several blocks. """ s_prog = Program() orig_s_prog = framework.default_startup_program() From dcb5a1ed6747a9168ecf55c8b5a39afc90a1caf0 Mon Sep 17 00:00:00 2001 From: ying Date: Tue, 23 Jan 2018 08:52:47 +0800 Subject: [PATCH 53/54] fix ci. --- python/paddle/v2/dataset/wmt16.py | 17 +++++++++-------- python/paddle/v2/fluid/layers/nn.py | 10 +++++----- python/paddle/v2/fluid/layers/ops.py | 18 ++++++++++++++---- .../book/test_understand_sentiment_lstm.py | 4 ++-- 4 files changed, 30 insertions(+), 19 deletions(-) diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py index bbc28a2da9..e2f463be2f 100644 --- a/python/paddle/v2/dataset/wmt16.py +++ b/python/paddle/v2/dataset/wmt16.py @@ -171,8 +171,9 @@ def train(src_dict_size, trg_dict_size, src_lang="en"): callable: The train reader. """ - assert (src_lang in ["en", "de"], ("An error language type. Only support: " - "en (for English); de(for Germany)")) + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. Only support: " + "en (for English); de(for Germany).") src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, src_lang) @@ -218,9 +219,9 @@ def test(src_dict_size, trg_dict_size, src_lang="en"): callable: The test reader. """ - assert (src_lang in ["en", "de"], - ("An error language type. " - "Only support: en (for English); de(for Germany)")) + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. 
" + "Only support: en (for English); de(for Germany).") src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, src_lang) @@ -266,9 +267,9 @@ def validation(src_dict_size, trg_dict_size, src_lang="en"): Returns: callable: The validation reader. """ - assert (src_lang in ["en", "de"], - ("An error language type. " - "Only support: en (for English); de(for Germany)")) + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. " + "Only support: en (for English); de(for Germany).") src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, src_lang) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 7b3ad707cc..a01ccfa635 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -2141,7 +2141,7 @@ def sequence_reshape(input, new_dim): return out -def transpose(input, perm, name=None): +def transpose(x, perm, name=None): """ **transpose Layer** @@ -2161,19 +2161,19 @@ def transpose(input, perm, name=None): .. code-block:: python x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32') - x_transposed = layers.transpose(input=x, perm=[1, 0, 2]) + x_transposed = layers.transpose(x, perm=[1, 0, 2]) """ - if len(perm) != len(input.shape): + if len(perm) != len(x.shape): raise ValueError( "Input(perm) is the permutation of dimensions of Input(input). " "It's length shoud be equal to Input(input)'s rank.") helper = LayerHelper('transpose', **locals()) - out = helper.create_tmp_variable(helper.input_dtype()) + out = helper.create_tmp_variable(x.dtype) helper.append_op( type='transpose', - inputs={'X': [input]}, + inputs={'X': [x]}, outputs={'Out': [out]}, attrs={'axis': perm}) return out diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py index b517f8be6a..022a94cad4 100644 --- a/python/paddle/v2/fluid/layers/ops.py +++ b/python/paddle/v2/fluid/layers/ops.py @@ -45,10 +45,20 @@ __activations__ = [ ] __all__ = [ - 'mean', 'mul', 'reshape', 'scale', 'transpose', - 'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div', - 'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min', - 'clip', 'clip_by_norm', 'sequence_softmax' + 'mean', + 'mul', + 'reshape', + 'scale', + 'sigmoid_cross_entropy_with_logits', + 'elementwise_add', + 'elementwise_div', + 'elementwise_sub', + 'elementwise_mul', + 'elementwise_max', + 'elementwise_min', + 'clip', + 'clip_by_norm', + 'sequence_softmax', ] + __activations__ for _OP in set(__all__): diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py index 6181914241..117f74c59a 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py @@ -65,13 +65,13 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) emb = fluid.layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim]) - emb = fluid.layers.transpose(x=emb, axis=[1, 0, 2]) + emb = fluid.layers.transpose(x=emb, perm=[1, 0, 2]) c_pre_init = fluid.layers.fill_constant( dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0) c_pre_init.stop_gradient = False layer_1_out = lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim) - layer_1_out = fluid.layers.transpose(x=layer_1_out, axis=[1, 0, 2]) + layer_1_out = 
fluid.layers.transpose(x=layer_1_out, perm=[1, 0, 2]) prediction = fluid.layers.fc(input=layer_1_out, size=class_dim, From 9b1a17a8356ce2c9180500eabc4fa6e8b2ac7849 Mon Sep 17 00:00:00 2001 From: chengduo Date: Tue, 23 Jan 2018 10:20:57 +0800 Subject: [PATCH 54/54] Refine conv2d_transpose layer doc (#6920) * refine conv2d_transpose layer doc * fix conv2d_transpose doc * fix doc --- paddle/operators/conv_transpose_op.cc | 10 +- paddle/operators/conv_transpose_op.h | 9 +- python/paddle/v2/fluid/layers/nn.py | 175 ++++++++++++++++---------- 3 files changed, 121 insertions(+), 73 deletions(-) diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index a2382a7e42..089290a506 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -160,8 +160,8 @@ Example: Output shape: $(N, C_{out}, H_{out}, W_{out})$ Where $$ - H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + H_f \\ - W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + W_f + H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\ + W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 $$ )DOC"); } @@ -249,9 +249,9 @@ Example: Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ Where $$ - D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + D_f \\ - H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + H_f \\ - W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + W_f + D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\ + H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\ + W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 $$ )DOC"); } diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index a42ade41b1..8c0d57afcd 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -141,9 +141,9 @@ class GemmConvTransposeKernel : public framework::OpKernel { if (data_dim == 2U) { // col2im: col_matrix -> dy // from (c * k_h * k_w, h * w) to (c, o_h, o_w) - col2im(dev_ctx, col, std::vector{dilations[0], dilations[1]}, - strides, std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, + col2im(dev_ctx, col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, &output_batch); } else if (data_dim == 3U) { // col2vol: col_matrix -> dy @@ -247,8 +247,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { if (data_dim == 2U) { // im2col: dy -> col matrix // from (c, o_h, o_w) to (c * k_h * k_w, h * w) - im2col(dev_ctx, output_grad_batch, - std::vector{dilations[0], dilations[1]}, strides, + im2col(dev_ctx, output_grad_batch, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index a01ccfa635..0721198816 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -790,8 +790,8 @@ def conv2d(input, `_ . If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. - For each input :math:`X`, the equation is: + For each input :math:`X`, the equation is: .. math:: @@ -799,51 +799,54 @@ def conv2d(input, In the above equation: - * :math:`X`: Input value, a tensor with NCHW format. 
- * :math:`W`: Filter value, a tensor with MCHW format. - * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. - * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + * :math:`X`: Input value, a tensor with NCHW format. + * :math:`W`: Filter value, a tensor with MCHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. Example: - Input: - Input shape: $(N, C_{in}, H_{in}, W_{in})$ + - Input: + + Input shape: $(N, C_{in}, H_{in}, W_{in})$ - Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ + + - Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ - Output: - Output shape: $(N, C_{out}, H_{out}, W_{out})$ Where - .. math:: + + .. math:: H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 Args: - input(Variable): The input image with [N, C, H, W] format. - num_filters(int): The number of filter. It is as same as the output - image channel. - filter_size(int|tuple|None): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_H, filter_size_W). - Otherwise, the filter will be a square. - stride(int|tuple): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: stride = 1. - padding(int|tuple): The padding size. If padding is a tuple, it must - contain two integers, (padding_H, padding_W). Otherwise, the - padding_H = padding_W = padding. Default: padding = 0. - groups(int): The groups number of the Conv2d Layer. According to grouped - convolution in Alex Krizhevsky's Deep CNN paper: when group=2, - the first half of the filters is only connected to the first half - of the input channels, while the second half of the filters is only - connected to the second half of the input channels. Default: groups=1 - param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None - bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None - use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - act(str): Activation type. Default: None + input(Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of filter. It is as same as the output + image channel. + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride(int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + padding(int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + groups(int): The groups number of the Conv2d Layer. 
According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None + bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None + use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act(str): Activation type. Default: None Returns: Variable: The tensor variable storing the convolution and \ @@ -858,7 +861,6 @@ def conv2d(input, data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") """ - if stride is None: stride = [1, 1] helper = LayerHelper('conv2d', **locals()) @@ -1212,38 +1214,85 @@ def conv2d_transpose(input, use_cudnn=True, name=None): """ - The transpose of conv2d layer. + **Convlution2D transpose layer** + + The convolution2D transpose layer calculates the output based on the input, + filter, and dilations, strides, paddings. Input(Input) and output(Output) + are in NCHW format. Where N is batch size, C is the number of channels, + H is the height of the feature, and W is the width of the feature. + Parameters(dilations, strides, paddings) are two elements. These two elements + represent height and width, respectively. The details of convolution transpose + layer, please refer to the following explanation and references `therein `_. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = W \\ast X + + In the above equation: + + * :math:`X`: Input value, a tensor with NCHW format. + * :math:`W`: Filter value, a tensor with MCHW format. + * :math:`\\ast` : Convolution transpose operation. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: - This layer is also known as deconvolution layer. + - Input: + + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + + Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ + + - Output: + + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + + Where + + .. math:: + + H_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ + W_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 Args: - input(Variable): The input image with [N, C, H, W] format. - num_filters(int): The number of filter. It is as same as the output - image channel. - output_size(int|tuple|None): The output image size. If output size is a - tuple, it must contain two integers, (image_H, image_W). This - parameter only works when filter_size is None. - filter_size(int|tuple|None): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_H, filter_size_W). - Otherwise, the filter will be a square. None if use output size to - calculate filter_size - padding(int|tuple): The padding size. If padding is a tuple, it must - contain two integers, (padding_H, padding_W). Otherwise, the - padding_H = padding_W = padding. - stride(int|tuple): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. - dilation(int|tuple): The dilation size. If dilation is a tuple, it must - contain two integers, (dilation_H, dilation_W). Otherwise, the - dilation_H = dilation_W = dilation. 
- param_attr: Parameter Attribute. - use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + input(Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of the filter. It is as same as the output + image channel. + output_size(int|tuple|None): The output image size. If output size is a + tuple, it must contain two integers, (image_H, image_W). This + parameter only works when filter_size is None. + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. None if use output size to + calculate filter_size. + padding(int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + stride(int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + dilation(int|tuple): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. Default: dilation = 1. + param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer. Default: None + use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: - Variable: Output image. + Variable: The tensor variable storing the convolution transpose result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) """ helper = LayerHelper("conv2d_transpose", **locals()) if not isinstance(input, Variable):
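# Worked example: the transposed-convolution output-size formula documented in
# the conv2d_transpose docstring above,
#     H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1
# checked in plain Python. This is a minimal sketch; the helper name and the
# sample numbers are illustrative assumptions, not part of the Paddle API or of
# this patch.

def conv2d_transpose_out_size(in_size, filter_size, stride=1, padding=0, dilation=1):
    """Output spatial size of a 2-D transposed convolution along one axis."""
    return (in_size - 1) * stride - 2 * padding + dilation * (filter_size - 1) + 1

# A 32x32 feature map with a 3x3 filter, stride 1, no padding grows to 34x34:
# (32 - 1) * 1 - 0 + 1 * (3 - 1) + 1 = 34.
assert conv2d_transpose_out_size(32, 3) == 34
# With stride 2 and padding 1: (32 - 1) * 2 - 2 * 1 + 1 * (3 - 1) + 1 = 63.
assert conv2d_transpose_out_size(32, 3, stride=2, padding=1) == 63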