From 48556ba3bbf228cbe418c3a0634df9f7c147b211 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 11 Oct 2017 12:39:53 +0000 Subject: [PATCH 01/18] add block_expand_op --- paddle/operators/block_expand_op.cc | 80 ++++++++++++++++++++++++++ paddle/operators/block_expand_op.cu | 0 paddle/operators/block_expand_op.h | 89 +++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+) create mode 100644 paddle/operators/block_expand_op.cc create mode 100644 paddle/operators/block_expand_op.cu create mode 100644 paddle/operators/block_expand_op.h diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc new file mode 100644 index 0000000000..0b36dc1ae5 --- /dev/null +++ b/paddle/operators/block_expand_op.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/block_expand_op.h" + +namespace paddle { +namespace operators { + +class BlockExpandOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("block"), + "Input(block) of BlockExpandOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("padding"), + "Input(padding) of BlockExpandOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("stride"), + "Input(stride) of BlockExpandOp should not be null."); + // ctx->SetOutputDim("Out", {1}); + } +}; + +class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BlockExpandOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("block", "The input of block_expand op"); + AddOutput("stride", "The output of block_expand op"); + AddComment(R"DOC( +Expand feature map to minibatch matrix. +- matrix width is: blockH_ * blockW_ * channels_ +- matirx height is: outputH_ * outputW_ + +outputH\_ = 1 + (2paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) / + strideH\_ \\ +outputW\_ = 1 + (2paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) / + strideW\_ + +The expand method is the same with ExpandConvLayer, but saved the transposed +value. After expanding, output_.sequenceStartPositions will store timeline. +The number of time steps are outputH_outputW_ and the dimension of each +time step is blockH_ * blockW_ * channels_. This layer can be used after +convolution neural network, and before recurrent neural network. 
+)DOC"); + } +}; + +class BlockExpandGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(block_expand, ops::BlockExpandOp, ops::BlockExpandOpMaker, + block_expand_grad, ops::BlockExpandOpGrad); +REGISTER_OP_CPU_KERNEL( + block_expand, ops::BlockExpanddKernel); +REGISTER_OP_CPU_KERNEL( + block_expand_grad, + ops::BlockExpandGradKernel); diff --git a/paddle/operators/block_expand_op.cu b/paddle/operators/block_expand_op.cu new file mode 100644 index 0000000000..e69de29bb2 diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h new file mode 100644 index 0000000000..54a9c5354f --- /dev/null +++ b/paddle/operators/block_expand_op.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include "paddle/operators/math/math_function.h" + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class BlockExpandKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using namespace framework; + const Tensor* input = context.Input("input"); + const Tensor* filter = context.Input("filter"); + const Tensor* stride = context.Input("stride"); + const Tensor* padding = context.Input("padding"); + Tensor* out = context.Output("Out"); + + auto input_dim = input->dims(); + size_t N = input_dim[0]; + size_t C = input_dim[1]; + PADDLE_ENFORCE_GE(N, 1, "Input batchsize must >= 1."); + PADDLE_ENFORCE_EQ(input_dim.size(), 4, "Input format must be NCHW."); + + size_t input_height = input_dim[2]; + size_t input_height = input_dim[3]; + + size_t filter_height = filter[0]; + size_t filter_width = filter[1]; + + size_t output_height = 1 + + (input_height + 2 * padding_height - block_height() + + stride_height - 1) / + stride_height; + + size_t output_width = + 1 + + (input_width + 2 * padding_width - block_width() + stride_width - 1) / + stride_width; + + Tensor col; + if (clo_format = KCFO) { + col.Resize( + {N, C, filter_height, filter_width, output_height, output_width}); + } else { + col.Resize( + {N, output_height, output_width, C, filter_height, filter_width}); + } + + for (size_t i = 0; i < N; i++) { + Im2ColFunctor(ctx, one_img, col, stride[0], + stride[1], padding[0], padding[1]); + } + } +}; + +template +class BlockExpandGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + /* + int x_num_col_dims = ctx.template Attr("x_num_col_dims"); + int y_num_col_dims = ctx.template Attr("y_num_col_dims"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + */ + } +}; + +} 
// namespace operators +} // namespace paddle From d2fda53217bf7c5370446f9a404b711ace9df130 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 12 Oct 2017 09:34:28 +0000 Subject: [PATCH 02/18] add expand comment --- paddle/operators/block_expand_op.cc | 40 +++++++++++++++++----- paddle/operators/block_expand_op.h | 52 ++++++++++++++--------------- 2 files changed, 58 insertions(+), 34 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index 0b36dc1ae5..69c5e02a65 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -23,12 +23,18 @@ class BlockExpandOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("block"), - "Input(block) of BlockExpandOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("padding"), - "Input(padding) of BlockExpandOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("stride"), - "Input(stride) of BlockExpandOp should not be null."); + using namespace framework; + PADDLE_ENFORCE(ctx->HasInput("input"), + "Input of BlockExpandOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of BlockExpandOp op should not be null."); + + auto in_dim = ctx->GetInputDim("input"); + PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); + PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); + + ctx->ShareLoD("X", /*->*/ "Out"); + // ctx->SetOutputDim("Out", {1}); } }; @@ -38,8 +44,26 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { BlockExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("block", "The input of block_expand op"); - AddOutput("stride", "The output of block_expand op"); + AddInput("input", "The input of block_expand op"); + AddOutput("out", "The output of block_expand op"); + 
AddAttr("block_height", + R"DOC( + )DOC"); + AddAttr("block_width", + R"DOC( + )DOC"); + AddAttr("stride_height", + R"DOC( + )DOC"); + AddAttr("stride_width", + R"DOC( + )DOC"); + AddAttr("padding_height", + R"DOC( + )DOC"); + AddAttr("padding_width", + R"DOC( + )DOC"); AddComment(R"DOC( Expand feature map to minibatch matrix. - matrix width is: blockH_ * blockW_ * channels_ diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 54a9c5354f..c0521dbbad 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -25,34 +25,34 @@ namespace operators { template class BlockExpandKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; - const Tensor* input = context.Input("input"); - const Tensor* filter = context.Input("filter"); - const Tensor* stride = context.Input("stride"); - const Tensor* padding = context.Input("padding"); - Tensor* out = context.Output("Out"); - - auto input_dim = input->dims(); - size_t N = input_dim[0]; - size_t C = input_dim[1]; - PADDLE_ENFORCE_GE(N, 1, "Input batchsize must >= 1."); - PADDLE_ENFORCE_EQ(input_dim.size(), 4, "Input format must be NCHW."); - - size_t input_height = input_dim[2]; - size_t input_height = input_dim[3]; - - size_t filter_height = filter[0]; - size_t filter_width = filter[1]; - - size_t output_height = 1 + - (input_height + 2 * padding_height - block_height() + - stride_height - 1) / - stride_height; - - size_t output_width = + const Tensor* in = ctx.Input("input"); + Tensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto in_dim = in->dims(); + int N = in_dim[0]; + int C = in_dim[1]; + + int in_height = in_dim[2]; + int in_width = in_dim[3]; + + int block_height = ctx.Attr("block_height"); + int block_width = ctx.Attr("block_width"); + int stride_height = 
ctx.Attr("stride_height"); + int stride_width = ctx.Attr("stride_width"); + int padding_height = ctx.Attr("padding_height"); + int padding_width = ctx.Attr("padding_width"); + + int output_height = + 1 + + (in_height + 2 * padding_height - block_height + stride_height - 1) / + stride_height; + + int output_width = 1 + - (input_width + 2 * padding_width - block_width() + stride_width - 1) / + (in_width + 2 * padding_width - block_width + stride_width - 1) / stride_width; Tensor col; From f1ca3f7e5ed13fc23acb2ce79f756e939e604031 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 16 Oct 2017 11:43:21 +0000 Subject: [PATCH 03/18] add block forward --- paddle/operators/block_expand_op.cc | 94 +++++++++++++++++------------ paddle/operators/block_expand_op.h | 81 +++++++++++++------------ 2 files changed, 98 insertions(+), 77 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index 69c5e02a65..ec46737400 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -24,18 +24,43 @@ class BlockExpandOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { using namespace framework; - PADDLE_ENFORCE(ctx->HasInput("input"), + PADDLE_ENFORCE(ctx->HasInput("X"), "Input of BlockExpandOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of BlockExpandOp op should not be null."); + "Output of BlockExpandOp op should not be null."); - auto in_dim = ctx->GetInputDim("input"); + auto in_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); - ctx->ShareLoD("X", /*->*/ "Out"); - - // ctx->SetOutputDim("Out", {1}); + int blockHeight = ctx->Attrs().Get("blockHeight"); + int blockWidth = ctx->Attrs().Get("blockWidth"); + int strideHeight = ctx->Attrs().Get("strideHeight"); + int strideWidth = 
ctx->Attrs().Get("strideWidth"); + int paddingHeight = ctx->Attrs().Get("paddingHeight"); + int paddingWidth = ctx->Attrs().Get("paddingWidth"); + + int N = in_dim[0]; + int C = in_dim[1]; + int imgHeight = in_dim[3]; + int imgWidth = in_dim[4]; + + int outputHeight = 0; + int outputWidth = 0; + + get_blockexpand_output_shape(imgHeight, imgWidth, blockHeight, blockWidth, + strideHeight, strideWidth, paddingHeight, + paddingWidth, outputHeight, outputWidth); + + // The result of im2col is [outputHeight, outputWidth, + // inputChannels, filterHeight, filterWidth], and it is easy to + // reshape into [seqLength, stepSize], where seqLength is equal + // outputHeight * outputWidth, stepSize is equal + // input_channels * blockHeight * blockWidth + ctx->SetOutputDim( + "Out", {N, outputHeight, outputWidth, C, blockHeight, blockWidth}); + + // ctx->ShareLoD("X", /*->*/ "Out"); } }; @@ -44,41 +69,36 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { BlockExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "The input of block_expand op"); - AddOutput("out", "The output of block_expand op"); - AddAttr("block_height", - R"DOC( - )DOC"); - AddAttr("block_width", - R"DOC( - )DOC"); - AddAttr("stride_height", - R"DOC( - )DOC"); - AddAttr("stride_width", - R"DOC( - )DOC"); - AddAttr("padding_height", - R"DOC( - )DOC"); - AddAttr("padding_width", - R"DOC( - )DOC"); + AddInput("X", R"DOC( +(Tensor)The input tensor has NCHW format. 
+ N: batch size + C: channels + H: height + W: width +)DOC"); + AddOutput("Out", "(LodTensor)The output data of block_expand op,"); + AddAttr("blockHeight", "(int)height of block."); + AddAttr("blockWidth", "(int)width of block."); + AddAttr("strideHeight", "(int)height of stride."); + AddAttr("strideWidth", "(int)width of stride."); + AddAttr("paddingHeight", "(int)height of padding."); + AddAttr("paddingWidth", "(int)width of padding."); AddComment(R"DOC( Expand feature map to minibatch matrix. -- matrix width is: blockH_ * blockW_ * channels_ -- matirx height is: outputH_ * outputW_ +- matirx height is: outputHeight * outputWidth +- matrix width is: blockHeight * blockWidth * channels -outputH\_ = 1 + (2paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) / - strideH\_ \\ -outputW\_ = 1 + (2paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) / - strideW\_ +outputHeight = + 1 + (2 * paddingHeight + imgHeight - blockHeight + strideHeight - 1) / + strideHeight; +outputWidth = + 1 + (2 * paddingWidth + imgWidth - blockWidth + strideWidth - 1) / + strideWidth; The expand method is the same with ExpandConvLayer, but saved the transposed -value. After expanding, output_.sequenceStartPositions will store timeline. -The number of time steps are outputH_outputW_ and the dimension of each -time step is blockH_ * blockW_ * channels_. This layer can be used after -convolution neural network, and before recurrent neural network. +value. After expanding, The number of time steps are outputHeight * outputWidth +and the dimension of each time step is blockHeight * blockWidth * channels. +This layer can be used after convolution neural network, and before recurrent neural network. 
)DOC"); } }; @@ -98,7 +118,7 @@ namespace ops = paddle::operators; REGISTER_OP(block_expand, ops::BlockExpandOp, ops::BlockExpandOpMaker, block_expand_grad, ops::BlockExpandOpGrad); REGISTER_OP_CPU_KERNEL( - block_expand, ops::BlockExpanddKernel); + block_expand, ops::BlockExpandKernel); REGISTER_OP_CPU_KERNEL( block_expand_grad, ops::BlockExpandGradKernel); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index c0521dbbad..58f9e4c6ad 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -18,10 +18,26 @@ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/img2col.h" namespace paddle { namespace operators { +inline void get_blockexpand_output_shape(int imgHeight, int imgWidth, + int blockHeight, int blockWidth, + int strideHeight, int strideWidth, + int paddingHeight, int paddingWidth, + int& outputHeight, int& outputWidth) { + outputHeight = + 1 + + (imgHeight + 2 * paddingHeight - blockHeight + strideHeight - 1) / + strideHeight; + + outputWidth = 1 + + (imgWidth + 2 * paddingWidth - blockWidth + strideWidth - 1) / + strideWidth; +} + template class BlockExpandKernel : public framework::OpKernel { public: @@ -34,39 +50,30 @@ class BlockExpandKernel : public framework::OpKernel { auto in_dim = in->dims(); int N = in_dim[0]; int C = in_dim[1]; - - int in_height = in_dim[2]; - int in_width = in_dim[3]; - - int block_height = ctx.Attr("block_height"); - int block_width = ctx.Attr("block_width"); - int stride_height = ctx.Attr("stride_height"); - int stride_width = ctx.Attr("stride_width"); - int padding_height = ctx.Attr("padding_height"); - int padding_width = ctx.Attr("padding_width"); - - int output_height = - 1 + - (in_height + 2 * padding_height - block_height + stride_height - 1) / - stride_height; - - int output_width = - 1 + - (in_width + 2 * padding_width - block_width + stride_width - 1) / - stride_width; - - Tensor col; - if 
(clo_format = KCFO) { - col.Resize( - {N, C, filter_height, filter_width, output_height, output_width}); - } else { - col.Resize( - {N, output_height, output_width, C, filter_height, filter_width}); - } - - for (size_t i = 0; i < N; i++) { - Im2ColFunctor(ctx, one_img, col, stride[0], - stride[1], padding[0], padding[1]); + int imgHeight = in_dim[2]; + int imgWidth = in_dim[3]; + + int blockHeight = ctx.Attr("blockHeight"); + int blockWidth = ctx.Attr("blockWidth"); + int strideHeight = ctx.Attr("strideHeight"); + int strideWidth = ctx.Attr("strideWidth"); + int paddingHeight = ctx.Attr("paddingHeight"); + int paddingWidth = ctx.Attr("paddingWidth"); + + int outputHeight = 0; + int outputWidth = 0; + + get_blockexpand_output_shape(imgHeight, imgWidth, blockHeight, blockWidth, + strideHeight, strideWidth, paddingHeight, + paddingWidth, outputHeight, outputWidth); + + for (int i = 0; i < N; i++) { + Tensor src = in->Slice(i, i + 1).Resize(C, imgHeight, imgWidth); + Tensor dst = out->Slice(i, i + 1).Resize(outputHeight, outputWidth, C, + blockHeight, blockWidth); + math::Im2ColFunctorGetPlace(), T>(ctx, src, dst, strideHeight, + strideWidth, paddingHeight, + paddingWidth); } } }; @@ -75,13 +82,7 @@ template class BlockExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - /* - int x_num_col_dims = ctx.template Attr("x_num_col_dims"); - int y_num_col_dims = ctx.template Attr("y_num_col_dims"); - const Tensor* x = ctx.Input("X"); - const Tensor* y = ctx.Input("Y"); - */ + using namespace framework; } }; From 6197c09bf92324696b237bf0320ce43d28097c70 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 16 Oct 2017 12:08:30 +0000 Subject: [PATCH 04/18] modify styles --- paddle/operators/block_expand_op.cc | 45 ++++++++++++------------ paddle/operators/block_expand_op.h | 53 +++++++++++++++-------------- 2 files changed, 50 insertions(+), 48 deletions(-) diff --git 
a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index ec46737400..b3fad3c81f 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -33,32 +33,33 @@ class BlockExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); - int blockHeight = ctx->Attrs().Get("blockHeight"); - int blockWidth = ctx->Attrs().Get("blockWidth"); - int strideHeight = ctx->Attrs().Get("strideHeight"); - int strideWidth = ctx->Attrs().Get("strideWidth"); - int paddingHeight = ctx->Attrs().Get("paddingHeight"); - int paddingWidth = ctx->Attrs().Get("paddingWidth"); + int block_height = ctx->Attrs().Get("blockHeight"); + int block_width = ctx->Attrs().Get("blockWidth"); + int stride_height = ctx->Attrs().Get("strideHeight"); + int stride_width = ctx->Attrs().Get("strideWidth"); + int padding_height = ctx->Attrs().Get("paddingHeight"); + int padding_width = ctx->Attrs().Get("paddingWidth"); int N = in_dim[0]; int C = in_dim[1]; - int imgHeight = in_dim[3]; - int imgWidth = in_dim[4]; + int img_height = in_dim[3]; + int img_width = in_dim[4]; - int outputHeight = 0; - int outputWidth = 0; + int output_height = 0; + int output_width = 0; - get_blockexpand_output_shape(imgHeight, imgWidth, blockHeight, blockWidth, - strideHeight, strideWidth, paddingHeight, - paddingWidth, outputHeight, outputWidth); + get_blockexpand_output_shape(img_height, img_width, block_height, + block_width, stride_height, stride_width, + padding_height, padding_width, output_height, + output_width); - // The result of im2col is [outputHeight, outputWidth, + // The result of im2col is [output_height, output_width, // inputChannels, filterHeight, filterWidth], and it is easy to // reshape into [seqLength, stepSize], where seqLength is equal - // outputHeight * outputWidth, stepSize is equal + // output_height * output_width, stepSize is 
equal // input_channels * blockHeight * blockWidth ctx->SetOutputDim( - "Out", {N, outputHeight, outputWidth, C, blockHeight, blockWidth}); + "Out", {N, output_height, output_width, C, block_height, block_width}); // ctx->ShareLoD("X", /*->*/ "Out"); } @@ -85,18 +86,18 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("paddingWidth", "(int)width of padding."); AddComment(R"DOC( Expand feature map to minibatch matrix. -- matirx height is: outputHeight * outputWidth +- matirx height is: output_height * output_width - matrix width is: blockHeight * blockWidth * channels -outputHeight = - 1 + (2 * paddingHeight + imgHeight - blockHeight + strideHeight - 1) / +output_height = + 1 + (2 * paddingHeight + img_height - blockHeight + strideHeight - 1) / strideHeight; -outputWidth = - 1 + (2 * paddingWidth + imgWidth - blockWidth + strideWidth - 1) / +output_width = + 1 + (2 * paddingWidth + img_width - blockWidth + strideWidth - 1) / strideWidth; The expand method is the same with ExpandConvLayer, but saved the transposed -value. After expanding, The number of time steps are outputHeight * outputWidth +value. After expanding, The number of time steps are output_height * output_width and the dimension of each time step is blockHeight * blockWidth * channels. This layer can be used after convolution neural network, and before recurrent neural network. 
)DOC"); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 58f9e4c6ad..bd6b307852 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -18,24 +18,25 @@ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/math/img2col.h" +#include "paddle/operators/math/im2col.h" namespace paddle { namespace operators { -inline void get_blockexpand_output_shape(int imgHeight, int imgWidth, - int blockHeight, int blockWidth, - int strideHeight, int strideWidth, - int paddingHeight, int paddingWidth, +inline void get_blockexpand_output_shape(int img_height, int img_width, + int block_height, int block_width, + int stride_height, int stride_width, + int padding_height, int padding_width, int& outputHeight, int& outputWidth) { outputHeight = 1 + - (imgHeight + 2 * paddingHeight - blockHeight + strideHeight - 1) / - strideHeight; + (img_height + 2 * padding_height - block_height + stride_height - 1) / + stride_height; - outputWidth = 1 + - (imgWidth + 2 * paddingWidth - blockWidth + strideWidth - 1) / - strideWidth; + outputWidth = + 1 + + (img_width + 2 * padding_width - block_width + stride_width - 1) / + stride_width; } template @@ -50,30 +51,30 @@ class BlockExpandKernel : public framework::OpKernel { auto in_dim = in->dims(); int N = in_dim[0]; int C = in_dim[1]; - int imgHeight = in_dim[2]; - int imgWidth = in_dim[3]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; - int blockHeight = ctx.Attr("blockHeight"); - int blockWidth = ctx.Attr("blockWidth"); - int strideHeight = ctx.Attr("strideHeight"); - int strideWidth = ctx.Attr("strideWidth"); - int paddingHeight = ctx.Attr("paddingHeight"); - int paddingWidth = ctx.Attr("paddingWidth"); + int block_height = ctx.Attr("blockHeight"); + int block_width = ctx.Attr("blockWidth"); + int stride_height = ctx.Attr("strideHeight"); + int stride_width = ctx.Attr("strideWidth"); + int padding_height = 
ctx.Attr("paddingHeight"); + int padding_width = ctx.Attr("paddingWidth"); int outputHeight = 0; int outputWidth = 0; - get_blockexpand_output_shape(imgHeight, imgWidth, blockHeight, blockWidth, - strideHeight, strideWidth, paddingHeight, - paddingWidth, outputHeight, outputWidth); + get_blockexpand_output_shape( + img_height, img_width, block_height, block_width, stride_height, + stride_width, padding_height, padding_width, outputHeight, outputWidth); for (int i = 0; i < N; i++) { - Tensor src = in->Slice(i, i + 1).Resize(C, imgHeight, imgWidth); + Tensor src = in->Slice(i, i + 1).Resize(C, img_height, img_width); Tensor dst = out->Slice(i, i + 1).Resize(outputHeight, outputWidth, C, - blockHeight, blockWidth); - math::Im2ColFunctorGetPlace(), T>(ctx, src, dst, strideHeight, - strideWidth, paddingHeight, - paddingWidth); + block_height, block_width); + math::Im2ColFunctor( + ctx, src, dst, stride_height, stride_width, padding_height, + padding_width); } } }; From 5a9dd8ae5a5154a4e2a96becc057621a6221ca55 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 17 Oct 2017 03:48:34 +0000 Subject: [PATCH 05/18] add gpu --- paddle/operators/block_expand_op.cc | 15 ++++++++-- paddle/operators/block_expand_op.cu | 24 +++++++++++++++ paddle/operators/block_expand_op.h | 46 +++++++++++++++++++++++++---- 3 files changed, 77 insertions(+), 8 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index b3fad3c81f..49c7011fe1 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -109,7 +109,18 @@ class BlockExpandGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext* ctx) const override {} + void InferShape(framework::InferShapeContext* ctx) const override { + using namespace framework; + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + 
PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output of BlockExpandOp op should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto in_dim = ctx->GetInputDim("X"); + + ctx->SetOutputDim(GradVarName("Out"), in_dim); + } }; } // namespace operators @@ -117,7 +128,7 @@ class BlockExpandGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(block_expand, ops::BlockExpandOp, ops::BlockExpandOpMaker, - block_expand_grad, ops::BlockExpandOpGrad); + block_expand_grad, ops::BlockExpandGradOp); REGISTER_OP_CPU_KERNEL( block_expand, ops::BlockExpandKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/operators/block_expand_op.cu b/paddle/operators/block_expand_op.cu index e69de29bb2..492ac0c9b2 100644 --- a/paddle/operators/block_expand_op.cu +++ b/paddle/operators/block_expand_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/operators/block_expand_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_GPU_KERNEL( + block_expand, ops::BlockExpandKernel); +REGISTER_OP_GPU_KERNEL( + block_expand_grad, + ops::BlockExpandGradKernel); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index bd6b307852..b272582883 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -69,12 +69,12 @@ class BlockExpandKernel : public framework::OpKernel { stride_width, padding_height, padding_width, outputHeight, outputWidth); for (int i = 0; i < N; i++) { - Tensor src = in->Slice(i, i + 1).Resize(C, img_height, img_width); - Tensor dst = out->Slice(i, i + 1).Resize(outputHeight, outputWidth, C, - block_height, block_width); - math::Im2ColFunctor( - ctx, src, dst, stride_height, stride_width, padding_height, - padding_width); + Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); + Tensor dst = out->Slice(i, i + 1).Resize( + {outputHeight, outputWidth, C, block_height, block_width}); + math::Im2ColFunctor f; + f(ctx.device_context(), src, dst, stride_height, stride_width, + padding_height, padding_width); } } }; @@ -84,6 +84,40 @@ class BlockExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; + auto* in = ctx.Input("X"); + auto* out = ctx.Input("Out"); + auto* out_grad = ctx.Output(GradVarName("Out")); + out_grad->mutable_data(ctx.GetPlace()); + + auto in_dim = in->dims(); + int N = in_dim[0]; + int C = in_dim[1]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; + + int block_height = ctx.Attr("blockHeight"); + int block_width = ctx.Attr("blockWidth"); + int stride_height = ctx.Attr("strideHeight"); + int stride_width = ctx.Attr("strideWidth"); + int padding_height = ctx.Attr("paddingHeight"); + int padding_width = ctx.Attr("paddingWidth"); + + int outputHeight = 0; 
+ int outputWidth = 0; + + get_blockexpand_output_shape( + img_height, img_width, block_height, block_width, stride_height, + stride_width, padding_height, padding_width, outputHeight, outputWidth); + + for (int i = 0; i < N; i++) { + Tensor dst = + out_grad->Slice(i, i + 1).Resize({C, img_height, img_width}); + Tensor src = out->Slice(i, i + 1).Resize( + {outputHeight, outputWidth, C, block_height, block_width}); + math::Im2ColFunctor f; + f(ctx.device_context(), src, dst, stride_height, stride_width, + padding_height, padding_width); + } } }; From 45f16c90456775d80ec3fbff5c87d17c06558c5b Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 17 Oct 2017 07:15:19 +0000 Subject: [PATCH 06/18] add py test --- .../framework/tests/test_block_expand_op.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 python/paddle/v2/framework/tests/test_block_expand_op.py diff --git a/python/paddle/v2/framework/tests/test_block_expand_op.py b/python/paddle/v2/framework/tests/test_block_expand_op.py new file mode 100644 index 0000000000..aa4fa479a9 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_block_expand_op.py @@ -0,0 +1,121 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def get_output_shape(attrs, X): + img_height = X.shape[2] + img_width = X.shpe[3] + padding_height = attrs['padding_height'] + padding_width = attrs['padding_width'] + block_height = attrs['block_height'] + block_width = attrs['block_width'] + stride_height = attrs['stride_height'] + stride_width = attrs['stride_width'] + output_height = \ + 1 + \ + (img_height + 2 * padding_height - block_height + stride_height - 1) / \ + stride_height + + output_width = \ + 1 + \ + (img_width + 2 * padding_width - block_width + stride_width - 1) / \ + stride_width + + return output_height, output_width + + +""" +img: {CHW} +col: + {output_height, output_width, inputChannels, filterHeight, filterWidth} +""" + + +def img2col(attrs, im, col): + input_channels = 
im.shape.dims[0] + input_height = im.shape.dims[1] + input_width = im.shape.dims[2] + filter_height = col.shape.dims[3] + filter_width = col.shape.dims[4] + output_height = col.shape.dims[0] + output_width = col.shape.dims[1] + + for col_row_idx in range(0, output_height): + for col_col_idx in range(0, output_width): + for channel in range(0, input_channels): + for filter_row_idx in range(0, filter_height): + for filter_col_idx in range(0, filter_width): + im_row_offset = col_row_idx * stride_height \ + + filter_row_idx - padding_height + im_col_offset = col_col_idx * stride_width \ + + filter_col_idx - padding_width + if (im_row_offset < 0 or + im_row_offset >= input_height or + im_col_offset < 0 or + im_col_offset >= input_width): + col[col_row_idx][col_col_idx][channel][ + filter_row_idx][filter_col_idx] = 0.0 + else: + im_offset = (channel * input_height + im_row_offset + ) * input_width + im_col_offset + col[col_row_idx][col_col_idx][channel][ + filter_row_idx][filter_col_idx] = im[channel][ + im_row_offset][im_col_offset] + + +""" +img: {CHW} +col: + {output_height, output_width, inputChannels, filterHeight, filterWidth} +""" + + +def col2img(attrs, col, img): + input_channels = im.shape.dims[0] + input_height = im.shape.dims[1] + input_width = im.shape.dims[2] + filter_height = col.shape.dims[3] + filter_width = col.shape.dims[4] + output_height = col.shape.dims[0] + output_width = col.shape.dims[1] + + for col_row_idx in range(0, output_height): + for col_col_idx in range(0, output_width): + for channel in range(0, input_channels): + for filter_row_idx in range(0, filter_height): + for filter_col_idx in range(0, filter_width): + im_row_offset = \ + col_row_idx * stride_height + filter_row_idx - padding_height + im_col_offset = \ + col_col_idx * stride_width + filter_col_idx - padding_width + if (im_row_offset >= 0 and + im_row_offset < input_height and + im_col_offset >= 0 and + im_col_offset < input_width): + im[channel][im_row_offset][im_col_offset] = \ + 
col[col_row_idx][col_col_idx][channel][filter_row_idx][filter_col_idx] + + +class TestBlockExpandMulOp(OpTest): + def setUp(self): + self.op_type = "block_expand" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 9, 9]).astype("float64"), + } + self.attrs = { + 'block_height': 3, + 'block_width': 3, + 'stride_height': 2, + 'stride_width': 2, + 'padding_height': 3, + 'padding_width': 3, + } + + self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out') From 32db8db51c5384f213a0b1402d2632519da5416a Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 17 Oct 2017 08:12:56 +0000 Subject: [PATCH 07/18] fix bugs --- paddle/operators/block_expand_op.cc | 9 +- paddle/operators/block_expand_op.h | 9 +- .../framework/tests/test_block_expand_op.py | 176 +++++++++++------- 3 files changed, 120 insertions(+), 74 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index 49c7011fe1..37ea57f393 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -23,6 +23,7 @@ class BlockExpandOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { + printf("op infershape\n"); using namespace framework; PADDLE_ENFORCE(ctx->HasInput("X"), "Input of BlockExpandOp should not be null."); @@ -33,6 +34,7 @@ class BlockExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); + printf("op infershape2\n"); int block_height = ctx->Attrs().Get("blockHeight"); int block_width = ctx->Attrs().Get("blockWidth"); int stride_height = ctx->Attrs().Get("strideHeight"); @@ -42,8 +44,8 @@ class BlockExpandOp : public framework::OperatorWithKernel { int N = in_dim[0]; int C = in_dim[1]; - int 
img_height = in_dim[3]; - int img_width = in_dim[4]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; int output_height = 0; int output_width = 0; @@ -58,6 +60,8 @@ class BlockExpandOp : public framework::OperatorWithKernel { // reshape into [seqLength, stepSize], where seqLength is equal // output_height * output_width, stepSize is equal // input_channels * blockHeight * blockWidth + printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, output_height, + output_width, C, block_height, block_width); ctx->SetOutputDim( "Out", {N, output_height, output_width, C, block_height, block_width}); @@ -77,6 +81,7 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { H: height W: width )DOC"); + printf("opmakeer\n"); AddOutput("Out", "(LodTensor)The output data of block_expand op,"); AddAttr("blockHeight", "(int)height of block."); AddAttr("blockWidth", "(int)width of block."); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index b272582883..69bd7d6987 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -44,7 +44,7 @@ class BlockExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; - const Tensor* in = ctx.Input("input"); + const Tensor* in = ctx.Input("X"); Tensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -68,7 +68,11 @@ class BlockExpandKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); + printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, outputHeight, + outputWidth, C, block_height, block_width); + for (int i = 0; i < N; i++) { + printf("i:%d\n", i); Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); Tensor dst = out->Slice(i, i + 1).Resize( {outputHeight, outputWidth, C, block_height, block_width}); @@ -109,6 
+113,9 @@ class BlockExpandGradKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); + printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, outputHeight, + outputWidth, C, block_height, block_width); + for (int i = 0; i < N; i++) { Tensor dst = out_grad->Slice(i, i + 1).Resize({C, img_height, img_width}); diff --git a/python/paddle/v2/framework/tests/test_block_expand_op.py b/python/paddle/v2/framework/tests/test_block_expand_op.py index aa4fa479a9..f8f4afc880 100644 --- a/python/paddle/v2/framework/tests/test_block_expand_op.py +++ b/python/paddle/v2/framework/tests/test_block_expand_op.py @@ -3,119 +3,153 @@ import numpy as np from op_test import OpTest -def get_output_shape(attrs, X): - img_height = X.shape[2] - img_width = X.shpe[3] - padding_height = attrs['padding_height'] - padding_width = attrs['padding_width'] - block_height = attrs['block_height'] - block_width = attrs['block_width'] - stride_height = attrs['stride_height'] - stride_width = attrs['stride_width'] - output_height = \ +def get_output_shape(attrs, x): + imgHeight = x.shape[1] + imgWidth = x.shape[2] + + paddingHeight = attrs['paddingHeight'] + paddingWidth = attrs['paddingWidth'] + blockHeight = attrs['blockHeight'] + blockWidth = attrs['blockWidth'] + strideHeight = attrs['strideHeight'] + strideWidth = attrs['strideWidth'] + + outputHeight = \ 1 + \ - (img_height + 2 * padding_height - block_height + stride_height - 1) / \ - stride_height + (imgHeight + 2 * paddingHeight - blockHeight + strideHeight - 1) / \ + strideHeight - output_width = \ + outputWidth = \ 1 + \ - (img_width + 2 * padding_width - block_width + stride_width - 1) / \ - stride_width + (imgWidth + 2 * paddingWidth - blockWidth + strideWidth - 1) / \ + strideWidth - return output_height, output_width + return outputHeight, outputWidth """ -img: {CHW} +im: {CHW} col: - {output_height, output_width, 
inputChannels, filterHeight, filterWidth} + {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} """ -def img2col(attrs, im, col): - input_channels = im.shape.dims[0] - input_height = im.shape.dims[1] - input_width = im.shape.dims[2] - filter_height = col.shape.dims[3] - filter_width = col.shape.dims[4] - output_height = col.shape.dims[0] - output_width = col.shape.dims[1] +def im2col(attrs, im, col): + input_channels = im.shape[0] + inputHeight = im.shape[1] + inputWidth = im.shape[2] + + outputHeight = col.shape[0] + outputWidth = col.shape[1] + filterHeight = col.shape[3] + filterWidth = col.shape[4] - for col_row_idx in range(0, output_height): - for col_col_idx in range(0, output_width): + strideHeight = attrs['strideHeight'] + strideWidth = attrs['strideWidth'] + paddingHeight = attrs['paddingHeight'] + paddingWidth = attrs['paddingWidth'] + + for col_row_idx in range(0, outputHeight): + for col_col_idx in range(0, outputWidth): for channel in range(0, input_channels): - for filter_row_idx in range(0, filter_height): - for filter_col_idx in range(0, filter_width): - im_row_offset = col_row_idx * stride_height \ - + filter_row_idx - padding_height - im_col_offset = col_col_idx * stride_width \ - + filter_col_idx - padding_width - if (im_row_offset < 0 or - im_row_offset >= input_height or + for filter_row_idx in range(0, filterHeight): + for filter_col_idx in range(0, filterWidth): + im_row_offset = col_row_idx * strideHeight \ + + filter_row_idx - paddingHeight + + im_col_offset = col_col_idx * strideWidth \ + + filter_col_idx - paddingWidth + + if (im_row_offset < 0 or im_row_offset >= inputHeight or im_col_offset < 0 or - im_col_offset >= input_width): - col[col_row_idx][col_col_idx][channel][ + im_col_offset >= inputWidth): + col[col_row_idx][col_col_idx][channel][\ filter_row_idx][filter_col_idx] = 0.0 else: - im_offset = (channel * input_height + im_row_offset - ) * input_width + im_col_offset - col[col_row_idx][col_col_idx][channel][ - 
filter_row_idx][filter_col_idx] = im[channel][ + im_offset = (channel * inputHeight + im_row_offset \ + ) * inputWidth + im_col_offset + + col[col_row_idx][col_col_idx][channel][\ + filter_row_idx][filter_col_idx] = im[channel][ \ im_row_offset][im_col_offset] """ img: {CHW} col: - {output_height, output_width, inputChannels, filterHeight, filterWidth} + {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} """ def col2img(attrs, col, img): - input_channels = im.shape.dims[0] - input_height = im.shape.dims[1] - input_width = im.shape.dims[2] - filter_height = col.shape.dims[3] - filter_width = col.shape.dims[4] - output_height = col.shape.dims[0] - output_width = col.shape.dims[1] - - for col_row_idx in range(0, output_height): - for col_col_idx in range(0, output_width): + input_channels = im.shape[0] + inputHeight = im.shape[1] + inputWidth = im.shape[2] + + outputHeight = col.shape[0] + outputWidth = col.shape[1] + filterHeight = col.shape[3] + filterWidth = col.shape[4] + + strideHeight = attrs['strideHeight'] + strideWidth = attrs['strideWidth'] + paddingHeight = attrs['paddingHeight'] + paddingWidth = attrs['paddingWidth'] + + for col_row_idx in range(0, outputHeight): + for col_col_idx in range(0, outputWidth): for channel in range(0, input_channels): - for filter_row_idx in range(0, filter_height): - for filter_col_idx in range(0, filter_width): + for filter_row_idx in range(0, filterHeight): + for filter_col_idx in range(0, filterWidth): im_row_offset = \ - col_row_idx * stride_height + filter_row_idx - padding_height + col_row_idx * strideHeight + filter_row_idx - paddingHeight im_col_offset = \ - col_col_idx * stride_width + filter_col_idx - padding_width + col_col_idx * strideWidth + filter_col_idx - paddingWidth if (im_row_offset >= 0 and - im_row_offset < input_height and + im_row_offset < inputHeight and im_col_offset >= 0 and - im_col_offset < input_width): + im_col_offset < inputWidth): im[channel][im_row_offset][im_col_offset] = \ 
col[col_row_idx][col_col_idx][channel][filter_row_idx][filter_col_idx] class TestBlockExpandMulOp(OpTest): def setUp(self): - self.op_type = "block_expand" - self.inputs = { - 'X': np.random.uniform(0.1, 1, [2, 3, 9, 9]).astype("float64"), - } - self.attrs = { - 'block_height': 3, - 'block_width': 3, - 'stride_height': 2, - 'stride_width': 2, - 'padding_height': 3, - 'padding_width': 3, + x = np.random.uniform(0.1, 1, [3, 9, 9]).astype("float32") + attrs = { + 'blockHeight': 3, + 'blockWidth': 3, + 'strideHeight': 2, + 'strideWidth': 2, + 'paddingHeight': 3, + 'paddingWidth': 3, } - self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} + outputHeight, outputWidth = get_output_shape(attrs, x) + out = np.random.uniform(0.1, 1,\ + [outputHeight, outputWidth, x.shape[0], \ + attrs['blockHeight'], attrs['blockWidth']]).astype("float32") + + self.op_type = "block_expand" + self.inputs = {'X': x.reshape(1, 3, 9, 9)} + self.attrs = attrs + + im2col(attrs, x, out) + self.outputs = { + 'Out':out.reshape(1, outputHeight, outputWidth, x.shape[0], \ + attrs['blockHeight'], attrs['blockWidth']) + } + #print out def test_check_output(self): self.check_output() + print 1 + """ def test_check_grad_normal(self): self.check_grad(['X'], 'Out') + """ + + +if __name__ == '__main__': + unittest.main() From d3ac3393fc803d210a1bab4f89249657b2e8786c Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 17 Oct 2017 12:43:32 +0000 Subject: [PATCH 08/18] fix bugs --- paddle/operators/block_expand_op.cc | 14 +------ paddle/operators/block_expand_op.h | 25 +++++------- .../framework/tests/test_block_expand_op.py | 40 ++++++++++++------- 3 files changed, 38 insertions(+), 41 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index 37ea57f393..d72c6b2de1 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -23,7 +23,6 @@ class BlockExpandOp : public framework::OperatorWithKernel { protected: 
void InferShape(framework::InferShapeContext* ctx) const override { - printf("op infershape\n"); using namespace framework; PADDLE_ENFORCE(ctx->HasInput("X"), "Input of BlockExpandOp should not be null."); @@ -34,7 +33,6 @@ class BlockExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); - printf("op infershape2\n"); int block_height = ctx->Attrs().Get("blockHeight"); int block_width = ctx->Attrs().Get("blockWidth"); int stride_height = ctx->Attrs().Get("strideHeight"); @@ -60,8 +58,6 @@ class BlockExpandOp : public framework::OperatorWithKernel { // reshape into [seqLength, stepSize], where seqLength is equal // output_height * output_width, stepSize is equal // input_channels * blockHeight * blockWidth - printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, output_height, - output_width, C, block_height, block_width); ctx->SetOutputDim( "Out", {N, output_height, output_width, C, block_height, block_width}); @@ -81,7 +77,6 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { H: height W: width )DOC"); - printf("opmakeer\n"); AddOutput("Out", "(LodTensor)The output data of block_expand op,"); AddAttr("blockHeight", "(int)height of block."); AddAttr("blockWidth", "(int)width of block."); @@ -117,14 +112,9 @@ class BlockExpandGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { using namespace framework; PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output of BlockExpandOp op should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - - auto in_dim = ctx->GetInputDim("X"); - - ctx->SetOutputDim(GradVarName("Out"), in_dim); + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), 
ctx->GetInputDim("X")); } }; diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 69bd7d6987..38d0626c73 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -68,11 +68,7 @@ class BlockExpandKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); - printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, outputHeight, - outputWidth, C, block_height, block_width); - for (int i = 0; i < N; i++) { - printf("i:%d\n", i); Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); Tensor dst = out->Slice(i, i + 1).Resize( {outputHeight, outputWidth, C, block_height, block_width}); @@ -89,9 +85,12 @@ class BlockExpandGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; auto* in = ctx.Input("X"); - auto* out = ctx.Input("Out"); - auto* out_grad = ctx.Output(GradVarName("Out")); - out_grad->mutable_data(ctx.GetPlace()); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); + + auto x_v = framework::EigenVector::Flatten(*d_x); + x_v.device(ctx.GetEigenDevice()) = x_v.constant(0.0); auto in_dim = in->dims(); int N = in_dim[0]; @@ -113,16 +112,12 @@ class BlockExpandGradKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); - printf("N:%d, o_h:%d o_w:%d C:%d b_h:%d b_w:%d\n", N, outputHeight, - outputWidth, C, block_height, block_width); - for (int i = 0; i < N; i++) { - Tensor dst = - out_grad->Slice(i, i + 1).Resize({C, img_height, img_width}); - Tensor src = out->Slice(i, i + 1).Resize( + Tensor dst = d_x->Slice(i, i + 1).Resize({C, img_height, img_width}); + Tensor src = d_out->Slice(i, 
i + 1).Resize( {outputHeight, outputWidth, C, block_height, block_width}); - math::Im2ColFunctor f; - f(ctx.device_context(), src, dst, stride_height, stride_width, + math::Col2ImFunctor f; + f(ctx.device_context(), dst, src, stride_height, stride_width, padding_height, padding_width); } } diff --git a/python/paddle/v2/framework/tests/test_block_expand_op.py b/python/paddle/v2/framework/tests/test_block_expand_op.py index f8f4afc880..c85f3a1ef1 100644 --- a/python/paddle/v2/framework/tests/test_block_expand_op.py +++ b/python/paddle/v2/framework/tests/test_block_expand_op.py @@ -113,16 +113,30 @@ def col2img(attrs, col, img): col[col_row_idx][col_col_idx][channel][filter_row_idx][filter_col_idx] -class TestBlockExpandMulOp(OpTest): +class TestBlockExpandOp(OpTest): + def get_input_data(self, C, H, W): + x = np.random.uniform(0.1, 1, [C, H, W]).astype("float32") + for c in range(0, C): + for h in range(0, H): + for w in range(0, W): + #x[c][h][w] = c * H * W + h *W + w + x[c][h][w] = 0.2 + 0.01 * (c * H * W + h * W + w) + return x + def setUp(self): - x = np.random.uniform(0.1, 1, [3, 9, 9]).astype("float32") + C = 3 + H = 4 + W = 4 + x = self.get_input_data(C, H, W) + #print x + attrs = { - 'blockHeight': 3, - 'blockWidth': 3, - 'strideHeight': 2, - 'strideWidth': 2, - 'paddingHeight': 3, - 'paddingWidth': 3, + 'blockHeight': 2, + 'blockWidth': 2, + 'strideHeight': 1, + 'strideWidth': 1, + 'paddingHeight': 1, + 'paddingWidth': 1, } outputHeight, outputWidth = get_output_shape(attrs, x) @@ -131,7 +145,7 @@ class TestBlockExpandMulOp(OpTest): attrs['blockHeight'], attrs['blockWidth']]).astype("float32") self.op_type = "block_expand" - self.inputs = {'X': x.reshape(1, 3, 9, 9)} + self.inputs = {'X': x.reshape(1, C, H, W)} self.attrs = attrs im2col(attrs, x, out) @@ -139,16 +153,14 @@ class TestBlockExpandMulOp(OpTest): 'Out':out.reshape(1, outputHeight, outputWidth, x.shape[0], \ attrs['blockHeight'], attrs['blockWidth']) } - #print out + """ def 
test_check_output(self): self.check_output() - print 1 - """ + def test_check_grad_normal(self): - self.check_grad(['X'], 'Out') - """ + self.check_grad(['X'], 'Out', max_relative_error=0.01) if __name__ == '__main__': From 4422a556dca7c9461dd7fdcf91b96a3e429aaf66 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 17 Oct 2017 12:46:33 +0000 Subject: [PATCH 09/18] rm not need --- .../framework/tests/test_block_expand_op.py | 28 ++++++++----------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_block_expand_op.py b/python/paddle/v2/framework/tests/test_block_expand_op.py index c85f3a1ef1..4c66493d6e 100644 --- a/python/paddle/v2/framework/tests/test_block_expand_op.py +++ b/python/paddle/v2/framework/tests/test_block_expand_op.py @@ -27,14 +27,12 @@ def get_output_shape(attrs, x): return outputHeight, outputWidth -""" -im: {CHW} -col: - {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} -""" - - def im2col(attrs, im, col): + """ + im: {CHW} + col: + {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} + """ input_channels = im.shape[0] inputHeight = im.shape[1] inputWidth = im.shape[2] @@ -74,14 +72,12 @@ def im2col(attrs, im, col): im_row_offset][im_col_offset] -""" -img: {CHW} -col: - {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} -""" - - def col2img(attrs, col, img): + """ + img: {CHW} + col: + {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} + """ input_channels = im.shape[0] inputHeight = im.shape[1] inputWidth = im.shape[2] @@ -154,13 +150,11 @@ class TestBlockExpandOp(OpTest): attrs['blockHeight'], attrs['blockWidth']) } - """ def test_check_output(self): self.check_output() - """ def test_check_grad_normal(self): - self.check_grad(['X'], 'Out', max_relative_error=0.01) + self.check_grad(['X'], 'Out') if __name__ == '__main__': From dbe0583cb0e79bfb156a9816b1ae2e5dfaf2c383 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 
21 Nov 2017 09:05:33 +0000 Subject: [PATCH 10/18] mv test position to fluid --- .../paddle/v2/{framework => fluid}/tests/test_block_expand_op.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/paddle/v2/{framework => fluid}/tests/test_block_expand_op.py (100%) diff --git a/python/paddle/v2/framework/tests/test_block_expand_op.py b/python/paddle/v2/fluid/tests/test_block_expand_op.py similarity index 100% rename from python/paddle/v2/framework/tests/test_block_expand_op.py rename to python/paddle/v2/fluid/tests/test_block_expand_op.py From 25a3d2d76f0146ac580cb484bb5a638ddc029bfa Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 22 Nov 2017 06:18:47 +0000 Subject: [PATCH 11/18] fix by comments --- paddle/operators/block_expand_op.cc | 3 +- paddle/operators/block_expand_op.h | 22 ++- .../v2/fluid/tests/test_block_expand_op.py | 175 +++++++++++------- 3 files changed, 123 insertions(+), 77 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index d72c6b2de1..f25cc4f9de 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -30,7 +30,8 @@ class BlockExpandOp : public framework::OperatorWithKernel { "Output of BlockExpandOp op should not be null."); auto in_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input format must be NCHW."); + PADDLE_ENFORCE_EQ(in_dim.size(), 4, + "Input(X) format must be 4D tensor, eg., NCHW."); PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); int block_height = ctx->Attrs().Get("blockHeight"); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 38d0626c73..aa0db2705c 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -68,13 +68,16 @@ class BlockExpandKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); + 
std::vector stride({stride_height, stride_width}); + std::vector padding({padding_height, padding_width}); + for (int i = 0; i < N; i++) { - Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize( + Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); + Tensor dst = out->Slice(i, i + 1).Resize( {outputHeight, outputWidth, C, block_height, block_width}); + math::Im2ColFunctor f; - f(ctx.device_context(), src, dst, stride_height, stride_width, - padding_height, padding_width); + f(ctx.device_context(), src, stride, padding, &dst); } } }; @@ -112,13 +115,16 @@ class BlockExpandGradKernel : public framework::OpKernel { img_height, img_width, block_height, block_width, stride_height, stride_width, padding_height, padding_width, outputHeight, outputWidth); + std::vector stride({stride_height, stride_width}); + std::vector padding({padding_height, padding_width}); + // std::vector stride({stride_height, stride_width}); + for (int i = 0; i < N; i++) { - Tensor dst = d_x->Slice(i, i + 1).Resize({C, img_height, img_width}); - Tensor src = d_out->Slice(i, i + 1).Resize( + Tensor dst = d_x->Slice(i, i + 1).Resize({C, img_height, img_width}); + Tensor src = d_out->Slice(i, i + 1).Resize( {outputHeight, outputWidth, C, block_height, block_width}); math::Col2ImFunctor f; - f(ctx.device_context(), dst, src, stride_height, stride_width, - padding_height, padding_width); + f(ctx.device_context(), dst, stride, padding, &src); } } }; diff --git a/python/paddle/v2/fluid/tests/test_block_expand_op.py b/python/paddle/v2/fluid/tests/test_block_expand_op.py index 4c66493d6e..b31ed53f4c 100644 --- a/python/paddle/v2/fluid/tests/test_block_expand_op.py +++ b/python/paddle/v2/fluid/tests/test_block_expand_op.py @@ -4,27 +4,27 @@ from op_test import OpTest def get_output_shape(attrs, x): - imgHeight = x.shape[1] - imgWidth = x.shape[2] + img_height = x.shape[1] + img_width = x.shape[2] - paddingHeight = 
attrs['paddingHeight'] - paddingWidth = attrs['paddingWidth'] - blockHeight = attrs['blockHeight'] - blockWidth = attrs['blockWidth'] - strideHeight = attrs['strideHeight'] - strideWidth = attrs['strideWidth'] + padding_height = attrs['paddingHeight'] + padding_width = attrs['paddingWidth'] + block_height = attrs['blockHeight'] + block_width = attrs['blockWidth'] + stride_height = attrs['strideHeight'] + stride_width = attrs['strideWidth'] - outputHeight = \ + output_height = \ 1 + \ - (imgHeight + 2 * paddingHeight - blockHeight + strideHeight - 1) / \ + (img_height + 2 * padding_height - block_height + stride_height - 1) / \ strideHeight - outputWidth = \ + output_width = \ 1 + \ - (imgWidth + 2 * paddingWidth - blockWidth + strideWidth - 1) / \ - strideWidth + (img_width + 2 * padding_width - block_width + stride_width - 1) / \ + stride_width - return outputHeight, outputWidth + return output_height, output_width def im2col(attrs, im, col): @@ -34,38 +34,39 @@ def im2col(attrs, im, col): {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} """ input_channels = im.shape[0] - inputHeight = im.shape[1] - inputWidth = im.shape[2] + input_height = im.shape[1] + input_width = im.shape[2] - outputHeight = col.shape[0] - outputWidth = col.shape[1] - filterHeight = col.shape[3] - filterWidth = col.shape[4] + output_height = col.shape[0] + output_width = col.shape[1] + filter_height = col.shape[3] + filter_width = col.shape[4] - strideHeight = attrs['strideHeight'] - strideWidth = attrs['strideWidth'] - paddingHeight = attrs['paddingHeight'] - paddingWidth = attrs['paddingWidth'] + stride_height = attrs['strideHeight'] + stride_width = attrs['strideWidth'] + padding_height = attrs['paddingHeight'] + padding_width = attrs['paddingWidth'] - for col_row_idx in range(0, outputHeight): - for col_col_idx in range(0, outputWidth): + for col_row_idx in range(0, output_height): + for col_col_idx in range(0, output_width): for channel in range(0, input_channels): - 
for filter_row_idx in range(0, filterHeight): - for filter_col_idx in range(0, filterWidth): - im_row_offset = col_row_idx * strideHeight \ - + filter_row_idx - paddingHeight + for filter_row_idx in range(0, filter_height): + for filter_col_idx in range(0, filter_width): + im_row_offset = col_row_idx * stride_height \ + + filter_row_idx - padding_height - im_col_offset = col_col_idx * strideWidth \ - + filter_col_idx - paddingWidth + im_col_offset = col_col_idx * stride_width \ + + filter_col_idx - padding_width - if (im_row_offset < 0 or im_row_offset >= inputHeight or + if (im_row_offset < 0 or + im_row_offset >= input_height or im_col_offset < 0 or - im_col_offset >= inputWidth): + im_col_offset >= input_width): col[col_row_idx][col_col_idx][channel][\ filter_row_idx][filter_col_idx] = 0.0 else: - im_offset = (channel * inputHeight + im_row_offset \ - ) * inputWidth + im_col_offset + im_offset = (channel * input_height + im_row_offset \ + ) * input_width + im_col_offset col[col_row_idx][col_col_idx][channel][\ filter_row_idx][filter_col_idx] = im[channel][ \ @@ -76,55 +77,55 @@ def col2img(attrs, col, img): """ img: {CHW} col: - {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} + {output_height, outputWidth, inputChannels, filterHeight, filterWidth} """ input_channels = im.shape[0] - inputHeight = im.shape[1] - inputWidth = im.shape[2] + input_height = im.shape[1] + input_width = im.shape[2] - outputHeight = col.shape[0] - outputWidth = col.shape[1] - filterHeight = col.shape[3] - filterWidth = col.shape[4] + output_height = col.shape[0] + output_width = col.shape[1] + filter_height = col.shape[3] + filter_width = col.shape[4] - strideHeight = attrs['strideHeight'] - strideWidth = attrs['strideWidth'] - paddingHeight = attrs['paddingHeight'] - paddingWidth = attrs['paddingWidth'] + stride_height = attrs['strideHeight'] + stride_width = attrs['strideWidth'] + padding_height = attrs['paddingHeight'] + padding_width = attrs['paddingWidth'] - for 
col_row_idx in range(0, outputHeight): - for col_col_idx in range(0, outputWidth): + for col_row_idx in range(0, output_height): + for col_col_idx in range(0, output_width): for channel in range(0, input_channels): - for filter_row_idx in range(0, filterHeight): - for filter_col_idx in range(0, filterWidth): + for filter_row_idx in range(0, filter_height): + for filter_col_idx in range(0, filter_width): im_row_offset = \ - col_row_idx * strideHeight + filter_row_idx - paddingHeight + col_row_idx * stride_height + filter_row_idx - padding_height im_col_offset = \ - col_col_idx * strideWidth + filter_col_idx - paddingWidth + col_col_idx * stride_width + filter_col_idx - padding_width if (im_row_offset >= 0 and - im_row_offset < inputHeight and + im_row_offset < input_height and im_col_offset >= 0 and - im_col_offset < inputWidth): + im_col_offset < input_width): im[channel][im_row_offset][im_col_offset] = \ col[col_row_idx][col_col_idx][channel][filter_row_idx][filter_col_idx] -class TestBlockExpandOp(OpTest): - def get_input_data(self, C, H, W): - x = np.random.uniform(0.1, 1, [C, H, W]).astype("float32") - for c in range(0, C): - for h in range(0, H): - for w in range(0, W): - #x[c][h][w] = c * H * W + h *W + w - x[c][h][w] = 0.2 + 0.01 * (c * H * W + h * W + w) +def get_input_data(C, H, W): + x = np.random.uniform(0.1, 1, [C, H, W]).astype("float32") + for c in range(0, C): + for h in range(0, H): + for w in range(0, W): + #x[c][h][w] = c * H * W + h *W + w + x[c][h][w] = 0.2 + 0.01 * (c * H * W + h * W + w) return x + +class TestBlockExpandOp(OpTest): def setUp(self): C = 3 H = 4 W = 4 - x = self.get_input_data(C, H, W) - #print x + x = get_input_data(C, H, W) attrs = { 'blockHeight': 2, @@ -135,9 +136,47 @@ class TestBlockExpandOp(OpTest): 'paddingWidth': 1, } - outputHeight, outputWidth = get_output_shape(attrs, x) + output_height, output_width = get_output_shape(attrs, x) + out = np.random.uniform(0.1, 1,\ + [output_height, output_width, x.shape[0], \ + 
attrs['blockHeight'], attrs['blockWidth']]).astype("float32") + + self.op_type = "block_expand" + self.inputs = {'X': x.reshape(1, C, H, W)} + self.attrs = attrs + + im2col(attrs, x, out) + self.outputs = { + 'Out':out.reshape(1, output_height, output_width, x.shape[0], \ + attrs['blockHeight'], attrs['blockWidth']) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out') + + +class TestBlockExpandOp2(OpTest): + def setUp(self): + C = 3 + H = 4 + W = 5 + x = get_input_data(C, H, W) + + attrs = { + 'blockHeight': 2, + 'blockWidth': 1, + 'strideHeight': 2, + 'strideWidth': 1, + 'paddingHeight': 2, + 'paddingWidth': 1, + } + + output_height, output_width = get_output_shape(attrs, x) out = np.random.uniform(0.1, 1,\ - [outputHeight, outputWidth, x.shape[0], \ + [output_height, output_width, x.shape[0], \ attrs['blockHeight'], attrs['blockWidth']]).astype("float32") self.op_type = "block_expand" @@ -146,7 +185,7 @@ class TestBlockExpandOp(OpTest): im2col(attrs, x, out) self.outputs = { - 'Out':out.reshape(1, outputHeight, outputWidth, x.shape[0], \ + 'Out':out.reshape(1, output_height, output_width, x.shape[0], \ attrs['blockHeight'], attrs['blockWidth']) } From e82f1008a82232936529ec4bba70a59880915912 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 17 Jan 2018 00:42:20 +0800 Subject: [PATCH 12/18] Finish block expand op 1. Add lod to output 2. Fix im2col arguments list 3. Refine code and doc 4. 
Fix output shape --- paddle/operators/block_expand_op.cc | 119 +++++++---- paddle/operators/block_expand_op.h | 140 ++++++------ .../v2/fluid/tests/test_block_expand_op.py | 202 ++++++++---------- 3 files changed, 239 insertions(+), 222 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index f25cc4f9de..317a43bb7b 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -32,37 +32,27 @@ class BlockExpandOp : public framework::OperatorWithKernel { auto in_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input(X) format must be 4D tensor, eg., NCHW."); - PADDLE_ENFORCE_GE(in_dim[0], 1, "Input batchsize must >= 1."); - int block_height = ctx->Attrs().Get("blockHeight"); - int block_width = ctx->Attrs().Get("blockWidth"); - int stride_height = ctx->Attrs().Get("strideHeight"); - int stride_width = ctx->Attrs().Get("strideWidth"); - int padding_height = ctx->Attrs().Get("paddingHeight"); - int padding_width = ctx->Attrs().Get("paddingWidth"); + int block_height = ctx->Attrs().Get("block_height"); + int block_width = ctx->Attrs().Get("block_width"); + int stride_height = ctx->Attrs().Get("stride_height"); + int stride_width = ctx->Attrs().Get("stride_width"); + int padding_height = ctx->Attrs().Get("padding_height"); + int padding_width = ctx->Attrs().Get("padding_width"); - int N = in_dim[0]; - int C = in_dim[1]; + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - int output_height = 0; - int output_width = 0; + int output_height = get_output_size(img_height, block_height, stride_height, + padding_height); + int output_width = + get_output_size(img_width, block_width, stride_width, padding_width); - get_blockexpand_output_shape(img_height, img_width, block_height, - block_width, stride_height, stride_width, - padding_height, padding_width, output_height, - output_width); - - // The result of im2col is 
[output_height, output_width, - // inputChannels, filterHeight, filterWidth], and it is easy to - // reshape into [seqLength, stepSize], where seqLength is equal - // output_height * output_width, stepSize is equal - // input_channels * blockHeight * blockWidth - ctx->SetOutputDim( - "Out", {N, output_height, output_width, C, block_height, block_width}); - - // ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("Out", {batch_size * output_height * output_width, + img_channels * block_height * block_width}); + // TODO(wanghaoshuang): cal lod in complie time } }; @@ -79,28 +69,69 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { W: width )DOC"); AddOutput("Out", "(LodTensor)The output data of block_expand op,"); - AddAttr("blockHeight", "(int)height of block."); - AddAttr("blockWidth", "(int)width of block."); - AddAttr("strideHeight", "(int)height of stride."); - AddAttr("strideWidth", "(int)width of stride."); - AddAttr("paddingHeight", "(int)height of padding."); - AddAttr("paddingWidth", "(int)width of padding."); + AddAttr("block_height", "(int)height of block."); + AddAttr("block_width", "(int)width of block."); + AddAttr("stride_height", "(int)height of stride."); + AddAttr("stride_width", "(int)width of stride."); + AddAttr("padding_height", "(int)height of padding."); + AddAttr("padding_width", "(int)width of padding."); AddComment(R"DOC( Expand feature map to minibatch matrix. - matirx height is: output_height * output_width -- matrix width is: blockHeight * blockWidth * channels - -output_height = - 1 + (2 * paddingHeight + img_height - blockHeight + strideHeight - 1) / - strideHeight; -output_width = - 1 + (2 * paddingWidth + img_width - blockWidth + strideWidth - 1) / - strideWidth; - -The expand method is the same with ExpandConvLayer, but saved the transposed -value. After expanding, The number of time steps are output_height * output_width -and the dimension of each time step is blockHeight * blockWidth * channels. 
-This layer can be used after convolution neural network, and before recurrent neural network. +- matrix width is: block_height * block_width * channels + +output_height = + 1 + (2 * padding_height + img_height - block_height + stride_height - 1) / + stride_height; +output_width = + 1 + (2 * padding_width + img_width - block_width + stride_width - 1) / + stride_width; + +After expanding, The number of time steps are output_height * output_width +and the dimension of each time step is block_height * block_width * channels. +This op can be used after convolution neural network, and before recurrent neural network. + +Given: + +x = [[[[ 6. 2. 1.] + [ 8. 3. 5.] + [ 0. 2. 6.]] + + [[ 2. 4. 4.] + [ 6. 3. 0.] + [ 6. 4. 7.]]] + + [[[ 6. 7. 1.] + [ 5. 7. 9.] + [ 2. 4. 8.]] + + [[ 1. 2. 1.] + [ 1. 3. 5.] + [ 9. 0. 8.]]]] +x.dims = {2, 2, 3, 3} + +And: + +block_height = 2 +block_width = 2 +stride_height = 1 +stride_width = 1 +padding_height = 0 +padding_width = 0 + +Then: + +output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.] + [ 2. 1. 3. 5. 4. 4. 3. 0.] + [ 8. 3. 0. 2. 6. 3. 6. 4.] + [ 3. 5. 2. 6. 3. 0. 4. 7.] + [ 6. 7. 5. 7. 1. 2. 1. 3.] + [ 7. 1. 7. 9. 2. 1. 3. 5.] + [ 5. 7. 2. 4. 1. 3. 9. 0.] + [ 7. 9. 4. 8. 3. 5. 0. 
8.]] +output.dims = {8, 9} +output.lod = [[0, 4, 8]] + )DOC"); } }; diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index aa0db2705c..022dc3a123 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -23,20 +23,9 @@ namespace paddle { namespace operators { -inline void get_blockexpand_output_shape(int img_height, int img_width, - int block_height, int block_width, - int stride_height, int stride_width, - int padding_height, int padding_width, - int& outputHeight, int& outputWidth) { - outputHeight = - 1 + - (img_height + 2 * padding_height - block_height + stride_height - 1) / - stride_height; - - outputWidth = - 1 + - (img_width + 2 * padding_width - block_width + stride_width - 1) / - stride_width; +inline int get_output_size(int img_size, int block_size, int stride, + int padding) { + return (1 + (img_size + 2 * padding - block_size + stride - 1) / stride); } template @@ -45,40 +34,54 @@ class BlockExpandKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; const Tensor* in = ctx.Input("X"); - Tensor* out = ctx.Output("Out"); + LoDTensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); auto in_dim = in->dims(); - int N = in_dim[0]; - int C = in_dim[1]; + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - - int block_height = ctx.Attr("blockHeight"); - int block_width = ctx.Attr("blockWidth"); - int stride_height = ctx.Attr("strideHeight"); - int stride_width = ctx.Attr("strideWidth"); - int padding_height = ctx.Attr("paddingHeight"); - int padding_width = ctx.Attr("paddingWidth"); - - int outputHeight = 0; - int outputWidth = 0; - - get_blockexpand_output_shape( - img_height, img_width, block_height, block_width, stride_height, - stride_width, padding_height, padding_width, outputHeight, outputWidth); - - std::vector 
stride({stride_height, stride_width}); - std::vector padding({padding_height, padding_width}); - - for (int i = 0; i < N; i++) { - Tensor src = in->Slice(i, i + 1).Resize({C, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize( - {outputHeight, outputWidth, C, block_height, block_width}); + int block_height = ctx.Attr("block_height"); + int block_width = ctx.Attr("block_width"); + int stride_height = ctx.Attr("stride_height"); + int stride_width = ctx.Attr("stride_width"); + int padding_height = ctx.Attr("padding_height"); + int padding_width = ctx.Attr("padding_width"); + + int output_height = get_output_size(img_height, block_height, stride_height, + padding_height); + int output_width = + get_output_size(img_width, block_width, stride_width, padding_width); + + const std::vector dilations({1, 1}); + const std::vector strides( + {stride_height, stride_width, stride_height, stride_width}); + const std::vector paddings( + {padding_height, padding_width, padding_height, padding_width}); + + auto out_dims = out->dims(); + out->Resize({batch_size, out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + const Tensor src = + in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + Tensor dst = out->Slice(i, i + 1).Resize({output_height, output_width, + img_channels, block_height, + block_width}); math::Im2ColFunctor f; - f(ctx.device_context(), src, stride, padding, &dst); + f(ctx.device_context(), src, dilations, strides, paddings, &dst); } + out->Resize(out_dims); + + // set lod information + // TODO(wanghaoshuang): Move this to InferShape + framework::LoD lod(1); + for (int i = 0, offset = 0; i < batch_size + 1; ++i) { + lod[0].push_back(offset); + offset += output_height * output_width; + } + out->set_lod(lod); } }; @@ -88,7 +91,8 @@ class BlockExpandGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using namespace framework; auto* in = ctx.Input("X"); - auto* 
d_out = ctx.Input(framework::GradVarName("Out")); + Tensor* d_out = + const_cast(ctx.Input(framework::GradVarName("Out"))); auto* d_x = ctx.Output(GradVarName("X")); d_x->mutable_data(ctx.GetPlace()); @@ -96,36 +100,40 @@ class BlockExpandGradKernel : public framework::OpKernel { x_v.device(ctx.GetEigenDevice()) = x_v.constant(0.0); auto in_dim = in->dims(); - int N = in_dim[0]; - int C = in_dim[1]; + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - int block_height = ctx.Attr("blockHeight"); - int block_width = ctx.Attr("blockWidth"); - int stride_height = ctx.Attr("strideHeight"); - int stride_width = ctx.Attr("strideWidth"); - int padding_height = ctx.Attr("paddingHeight"); - int padding_width = ctx.Attr("paddingWidth"); - - int outputHeight = 0; - int outputWidth = 0; - - get_blockexpand_output_shape( - img_height, img_width, block_height, block_width, stride_height, - stride_width, padding_height, padding_width, outputHeight, outputWidth); - - std::vector stride({stride_height, stride_width}); - std::vector padding({padding_height, padding_width}); - // std::vector stride({stride_height, stride_width}); - - for (int i = 0; i < N; i++) { - Tensor dst = d_x->Slice(i, i + 1).Resize({C, img_height, img_width}); - Tensor src = d_out->Slice(i, i + 1).Resize( - {outputHeight, outputWidth, C, block_height, block_width}); + int block_height = ctx.Attr("block_height"); + int block_width = ctx.Attr("block_width"); + int stride_height = ctx.Attr("stride_height"); + int stride_width = ctx.Attr("stride_width"); + int padding_height = ctx.Attr("padding_height"); + int padding_width = ctx.Attr("padding_width"); + int output_height = get_output_size(img_height, block_height, stride_height, + padding_height); + int output_width = + get_output_size(img_width, block_width, stride_width, padding_width); + + const std::vector dilations({1, 1}); + const std::vector strides( + {stride_height, stride_width, 
stride_height, stride_width}); + const std::vector paddings( + {padding_height, padding_width, padding_height, padding_width}); + + auto d_out_dims = d_out->dims(); + d_out->Resize({batch_size, d_out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + Tensor dst = + d_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + const Tensor src = d_out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, block_height, + block_width}); math::Col2ImFunctor f; - f(ctx.device_context(), dst, stride, padding, &src); + f(ctx.device_context(), src, dilations, strides, paddings, &dst); } + d_out->Resize(d_out_dims); } }; diff --git a/python/paddle/v2/fluid/tests/test_block_expand_op.py b/python/paddle/v2/fluid/tests/test_block_expand_op.py index b31ed53f4c..424bc7dc6e 100644 --- a/python/paddle/v2/fluid/tests/test_block_expand_op.py +++ b/python/paddle/v2/fluid/tests/test_block_expand_op.py @@ -4,20 +4,20 @@ from op_test import OpTest def get_output_shape(attrs, x): - img_height = x.shape[1] - img_width = x.shape[2] + img_height = x.shape[2] + img_width = x.shape[3] - padding_height = attrs['paddingHeight'] - padding_width = attrs['paddingWidth'] - block_height = attrs['blockHeight'] - block_width = attrs['blockWidth'] - stride_height = attrs['strideHeight'] - stride_width = attrs['strideWidth'] + padding_height = attrs['padding_height'] + padding_width = attrs['padding_width'] + block_height = attrs['block_height'] + block_width = attrs['block_width'] + stride_height = attrs['stride_height'] + stride_width = attrs['stride_width'] output_height = \ 1 + \ (img_height + 2 * padding_height - block_height + stride_height - 1) / \ - strideHeight + stride_height output_width = \ 1 + \ @@ -42,10 +42,10 @@ def im2col(attrs, im, col): filter_height = col.shape[3] filter_width = col.shape[4] - stride_height = attrs['strideHeight'] - stride_width = attrs['strideWidth'] - padding_height = attrs['paddingHeight'] - padding_width = 
attrs['paddingWidth'] + stride_height = attrs['stride_height'] + stride_width = attrs['stride_width'] + padding_height = attrs['padding_height'] + padding_width = attrs['padding_width'] for col_row_idx in range(0, output_height): for col_col_idx in range(0, output_width): @@ -73,83 +73,51 @@ def im2col(attrs, im, col): im_row_offset][im_col_offset] -def col2img(attrs, col, img): - """ - img: {CHW} - col: - {output_height, outputWidth, inputChannels, filterHeight, filterWidth} - """ - input_channels = im.shape[0] - input_height = im.shape[1] - input_width = im.shape[2] - - output_height = col.shape[0] - output_width = col.shape[1] - filter_height = col.shape[3] - filter_width = col.shape[4] +def block_expand(inputs, attrs): + output_height, output_width = get_output_shape(attrs, inputs) + img_channels = inputs.shape[1] + batch_size = inputs.shape[0] + out = np.zeros([ + batch_size, output_height, output_width, img_channels, + attrs['block_height'], attrs['block_width'] + ]).astype("float32") - stride_height = attrs['strideHeight'] - stride_width = attrs['strideWidth'] - padding_height = attrs['paddingHeight'] - padding_width = attrs['paddingWidth'] + for i in range(len(inputs)): + im2col(attrs, inputs[i], out[i]) - for col_row_idx in range(0, output_height): - for col_col_idx in range(0, output_width): - for channel in range(0, input_channels): - for filter_row_idx in range(0, filter_height): - for filter_col_idx in range(0, filter_width): - im_row_offset = \ - col_row_idx * stride_height + filter_row_idx - padding_height - im_col_offset = \ - col_col_idx * stride_width + filter_col_idx - padding_width - if (im_row_offset >= 0 and - im_row_offset < input_height and - im_col_offset >= 0 and - im_col_offset < input_width): - im[channel][im_row_offset][im_col_offset] = \ - col[col_row_idx][col_col_idx][channel][filter_row_idx][filter_col_idx] - - -def get_input_data(C, H, W): - x = np.random.uniform(0.1, 1, [C, H, W]).astype("float32") - for c in range(0, C): - for h 
in range(0, H): - for w in range(0, W): - #x[c][h][w] = c * H * W + h *W + w - x[c][h][w] = 0.2 + 0.01 * (c * H * W + h * W + w) - return x + out = out.reshape([ + batch_size * output_height * output_width, + img_channels * attrs['block_height'] * attrs['block_width'] + ]) + return out class TestBlockExpandOp(OpTest): - def setUp(self): - C = 3 - H = 4 - W = 4 - x = get_input_data(C, H, W) - - attrs = { - 'blockHeight': 2, - 'blockWidth': 2, - 'strideHeight': 1, - 'strideWidth': 1, - 'paddingHeight': 1, - 'paddingWidth': 1, + def config(self): + self.batch_size = 1 + self.img_channels = 3 + self.img_height = 4 + self.img_width = 4 + self.attrs = { + 'block_height': 2, + 'block_width': 2, + 'stride_height': 1, + 'stride_width': 1, + 'padding_height': 1, + 'padding_width': 1, } - output_height, output_width = get_output_shape(attrs, x) - out = np.random.uniform(0.1, 1,\ - [output_height, output_width, x.shape[0], \ - attrs['blockHeight'], attrs['blockWidth']]).astype("float32") - + def setUp(self): + self.config() self.op_type = "block_expand" - self.inputs = {'X': x.reshape(1, C, H, W)} - self.attrs = attrs + #x = np.random.uniform(0.1, 1, + x = np.random.randint(0, 10, [ + self.batch_size, self.img_channels, self.img_height, self.img_width + ]).astype("float32") - im2col(attrs, x, out) - self.outputs = { - 'Out':out.reshape(1, output_height, output_width, x.shape[0], \ - attrs['blockHeight'], attrs['blockWidth']) - } + out = block_expand(x, self.attrs) + self.inputs = {'X': x} + self.outputs = {'Out': out} def test_check_output(self): self.check_output() @@ -158,42 +126,52 @@ class TestBlockExpandOp(OpTest): self.check_grad(['X'], 'Out') -class TestBlockExpandOp2(OpTest): - def setUp(self): - C = 3 - H = 4 - W = 5 - x = get_input_data(C, H, W) - - attrs = { - 'blockHeight': 2, - 'blockWidth': 1, - 'strideHeight': 2, - 'strideWidth': 1, - 'paddingHeight': 2, - 'paddingWidth': 1, +class TestBlockExpandOpCase2(TestBlockExpandOp): + def config(self): + self.batch_size 
= 2 + self.img_channels = 3 + self.img_height = 4 + self.img_width = 5 + self.attrs = { + 'block_height': 2, + 'block_width': 1, + 'stride_height': 2, + 'stride_width': 1, + 'padding_height': 2, + 'padding_width': 1, } - output_height, output_width = get_output_shape(attrs, x) - out = np.random.uniform(0.1, 1,\ - [output_height, output_width, x.shape[0], \ - attrs['blockHeight'], attrs['blockWidth']]).astype("float32") - - self.op_type = "block_expand" - self.inputs = {'X': x.reshape(1, C, H, W)} - self.attrs = attrs - im2col(attrs, x, out) - self.outputs = { - 'Out':out.reshape(1, output_height, output_width, x.shape[0], \ - attrs['blockHeight'], attrs['blockWidth']) - } +class TestBlockExpandOpCase3(TestBlockExpandOp): + def config(self): + self.batch_size = 3 + self.img_channels = 1 + self.img_height = 4 + self.img_width = 5 + self.attrs = { + 'block_height': 2, + 'block_width': 1, + 'stride_height': 2, + 'stride_width': 1, + 'padding_height': 2, + 'padding_width': 0, + } - def test_check_output(self): - self.check_output() - def test_check_grad_normal(self): - self.check_grad(['X'], 'Out') +class TestBlockExpandOpCase4(TestBlockExpandOp): + def config(self): + self.batch_size = 2 + self.img_channels = 2 + self.img_height = 3 + self.img_width = 3 + self.attrs = { + 'block_height': 2, + 'block_width': 2, + 'stride_height': 1, + 'stride_width': 1, + 'padding_height': 0, + 'padding_width': 0, + } if __name__ == '__main__': From 92baa885a0bd6f752fbda290aa69b698b90bd53a Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 17 Jan 2018 09:56:17 +0800 Subject: [PATCH 13/18] Fix code style --- paddle/operators/block_expand_op.cc | 2 -- paddle/operators/block_expand_op.h | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index 317a43bb7b..bef82183b8 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -23,7 +23,6 @@ class 
BlockExpandOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - using namespace framework; PADDLE_ENFORCE(ctx->HasInput("X"), "Input of BlockExpandOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -142,7 +141,6 @@ class BlockExpandGradOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - using namespace framework; PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) shouldn't be null."); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 022dc3a123..2e4f0cb6f1 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -23,6 +23,9 @@ namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + inline int get_output_size(int img_size, int block_size, int stride, int padding) { return (1 + (img_size + 2 * padding - block_size + stride - 1) / stride); @@ -32,7 +35,6 @@ template class BlockExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using namespace framework; const Tensor* in = ctx.Input("X"); LoDTensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -89,11 +91,10 @@ template class BlockExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using namespace framework; auto* in = ctx.Input("X"); Tensor* d_out = const_cast(ctx.Input(framework::GradVarName("Out"))); - auto* d_x = ctx.Output(GradVarName("X")); + auto* d_x = ctx.Output(framework::GradVarName("X")); d_x->mutable_data(ctx.GetPlace()); auto x_v = framework::EigenVector::Flatten(*d_x); From 09adb769037b34fbe8a50fd48bc3284f13456f3a Mon Sep 17 00:00:00 2001 From: 
wanghaoshuang Date: Wed, 17 Jan 2018 11:15:54 +0800 Subject: [PATCH 14/18] Fix code style --- paddle/operators/block_expand_op.cc | 21 ++++++++++----------- paddle/operators/block_expand_op.cu | 9 +++++---- paddle/operators/block_expand_op.h | 17 ++++++++++------- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/block_expand_op.cc index bef82183b8..f9b75ffee7 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/block_expand_op.cc @@ -57,16 +57,14 @@ class BlockExpandOp : public framework::OperatorWithKernel { class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { public: - BlockExpandOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + BlockExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", R"DOC( -(Tensor)The input tensor has NCHW format. - N: batch size - C: channels - H: height - W: width -)DOC"); + AddInput("X", + "(Tensor)The input tensor has NCHW format." 
+ "N: batch size" + "C: channels" + "H: height" + "W: width"); AddOutput("Out", "(LodTensor)The output data of block_expand op,"); AddAttr("block_height", "(int)height of block."); AddAttr("block_width", "(int)width of block."); @@ -155,7 +153,8 @@ namespace ops = paddle::operators; REGISTER_OP(block_expand, ops::BlockExpandOp, ops::BlockExpandOpMaker, block_expand_grad, ops::BlockExpandGradOp); REGISTER_OP_CPU_KERNEL( - block_expand, ops::BlockExpandKernel); + block_expand, + ops::BlockExpandKernel); REGISTER_OP_CPU_KERNEL( block_expand_grad, - ops::BlockExpandGradKernel); + ops::BlockExpandGradKernel); diff --git a/paddle/operators/block_expand_op.cu b/paddle/operators/block_expand_op.cu index 492ac0c9b2..c17b113807 100644 --- a/paddle/operators/block_expand_op.cu +++ b/paddle/operators/block_expand_op.cu @@ -17,8 +17,9 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - block_expand, ops::BlockExpandKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + block_expand, + ops::BlockExpandKernel); +REGISTER_OP_CUDA_KERNEL( block_expand_grad, - ops::BlockExpandGradKernel); + ops::BlockExpandGradKernel); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/block_expand_op.h index 2e4f0cb6f1..72760fb23c 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/block_expand_op.h @@ -31,7 +31,7 @@ inline int get_output_size(int img_size, int block_size, int stride, return (1 + (img_size + 2 * padding - block_size + stride - 1) / stride); } -template +template class BlockExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -71,8 +71,9 @@ class BlockExpandKernel : public framework::OpKernel { img_channels, block_height, block_width}); - math::Im2ColFunctor f; - f(ctx.device_context(), src, dilations, strides, paddings, &dst); + math::Im2ColFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); } 
out->Resize(out_dims); @@ -87,7 +88,7 @@ class BlockExpandKernel : public framework::OpKernel { } }; -template +template class BlockExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -98,7 +99,8 @@ class BlockExpandGradKernel : public framework::OpKernel { d_x->mutable_data(ctx.GetPlace()); auto x_v = framework::EigenVector::Flatten(*d_x); - x_v.device(ctx.GetEigenDevice()) = x_v.constant(0.0); + auto& place = *ctx.template device_context().eigen_device(); + x_v.device(place) = x_v.constant(0.0); auto in_dim = in->dims(); int batch_size = in_dim[0]; @@ -131,8 +133,9 @@ class BlockExpandGradKernel : public framework::OpKernel { const Tensor src = d_out->Slice(i, i + 1).Resize( {output_height, output_width, img_channels, block_height, block_width}); - math::Col2ImFunctor f; - f(ctx.device_context(), src, dilations, strides, paddings, &dst); + math::Col2ImFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); } d_out->Resize(d_out_dims); } From fe45f2115ff37cb4ec2ee2bd53692b5eaa422613 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 17 Jan 2018 19:01:11 +0800 Subject: [PATCH 15/18] 1. Rename 'block_expand' to im2sequence 2. 
Refine code and doc --- .../{block_expand_op.cc => im2sequence_op.cc} | 30 +++++++++---------- .../{block_expand_op.cu => im2sequence_op.cu} | 10 +++---- .../{block_expand_op.h => im2sequence_op.h} | 16 ++++++---- ...ck_expand_op.py => test_im2sequence_op.py} | 30 +++++++++++++------ 4 files changed, 51 insertions(+), 35 deletions(-) rename paddle/operators/{block_expand_op.cc => im2sequence_op.cc} (84%) rename paddle/operators/{block_expand_op.cu => im2sequence_op.cu} (77%) rename paddle/operators/{block_expand_op.h => im2sequence_op.h} (92%) rename python/paddle/v2/fluid/tests/{test_block_expand_op.py => test_im2sequence_op.py} (85%) diff --git a/paddle/operators/block_expand_op.cc b/paddle/operators/im2sequence_op.cc similarity index 84% rename from paddle/operators/block_expand_op.cc rename to paddle/operators/im2sequence_op.cc index f9b75ffee7..9b2397bdc8 100644 --- a/paddle/operators/block_expand_op.cc +++ b/paddle/operators/im2sequence_op.cc @@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/operators/block_expand_op.h" +#include "paddle/operators/im2sequence_op.h" namespace paddle { namespace operators { -class BlockExpandOp : public framework::OperatorWithKernel { +class Im2SequenceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input of BlockExpandOp should not be null."); + "Input(X) of Im2SequenceOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output of BlockExpandOp op should not be null."); + "Output(Out) of Im2SequenceOp op should not be null."); auto in_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(in_dim.size(), 4, @@ -55,9 +55,9 @@ class BlockExpandOp : public framework::OperatorWithKernel { } }; -class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { +class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { public: - BlockExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) + Im2SequenceOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor)The input tensor has NCHW format." @@ -65,7 +65,7 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { "C: channels" "H: height" "W: width"); - AddOutput("Out", "(LodTensor)The output data of block_expand op,"); + AddOutput("Out", "(LodTensor)The output data of im2sequence op,"); AddAttr("block_height", "(int)height of block."); AddAttr("block_width", "(int)width of block."); AddAttr("stride_height", "(int)height of stride."); @@ -73,7 +73,7 @@ class BlockExpandOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("padding_height", "(int)height of padding."); AddAttr("padding_width", "(int)width of padding."); AddComment(R"DOC( -Expand feature map to minibatch matrix. +Convert feature map to minibatch matrix. 
- matirx height is: output_height * output_width - matrix width is: block_height * block_width * channels @@ -133,7 +133,7 @@ output.lod = [[0, 4, 8]] } }; -class BlockExpandGradOp : public framework::OperatorWithKernel { +class Im2SequenceGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -150,11 +150,11 @@ class BlockExpandGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(block_expand, ops::BlockExpandOp, ops::BlockExpandOpMaker, - block_expand_grad, ops::BlockExpandGradOp); +REGISTER_OP(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker, + im2sequence_grad, ops::Im2SequenceGradOp); REGISTER_OP_CPU_KERNEL( - block_expand, - ops::BlockExpandKernel); + im2sequence, + ops::Im2SequenceKernel); REGISTER_OP_CPU_KERNEL( - block_expand_grad, - ops::BlockExpandGradKernel); + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/operators/block_expand_op.cu b/paddle/operators/im2sequence_op.cu similarity index 77% rename from paddle/operators/block_expand_op.cu rename to paddle/operators/im2sequence_op.cu index c17b113807..9db7529112 100644 --- a/paddle/operators/block_expand_op.cu +++ b/paddle/operators/im2sequence_op.cu @@ -13,13 +13,13 @@ limitations under the License. 
*/ #define EIGEN_USE_GPU -#include "paddle/operators/block_expand_op.h" +#include "paddle/operators/im2sequence_op.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - block_expand, - ops::BlockExpandKernel); + im2sequence, + ops::Im2SequenceKernel); REGISTER_OP_CUDA_KERNEL( - block_expand_grad, - ops::BlockExpandGradKernel); + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/operators/block_expand_op.h b/paddle/operators/im2sequence_op.h similarity index 92% rename from paddle/operators/block_expand_op.h rename to paddle/operators/im2sequence_op.h index 72760fb23c..85d6cac444 100644 --- a/paddle/operators/block_expand_op.h +++ b/paddle/operators/im2sequence_op.h @@ -14,11 +14,11 @@ #pragma once -#include "paddle/operators/math/math_function.h" - +#include "paddle/framework/data_layout.h" #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/math/im2col.h" +#include "paddle/operators/math/math_function.h" namespace paddle { namespace operators { @@ -32,13 +32,16 @@ inline int get_output_size(int img_size, int block_size, int stride, } template -class BlockExpandKernel : public framework::OpKernel { +class Im2SequenceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const Tensor* in = ctx.Input("X"); LoDTensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - + // TODO(wanghaoshuang): Add layout checker after 'set_layout' + // being available for python API + // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW, + // "Input(X) layout must be NCHW"); auto in_dim = in->dims(); int batch_size = in_dim[0]; int img_channels = in_dim[1]; @@ -80,8 +83,9 @@ class BlockExpandKernel : public framework::OpKernel { // set lod information // TODO(wanghaoshuang): Move this to InferShape framework::LoD lod(1); + lod[0].reserve(batch_size + 1); for (int i = 0, offset = 0; i < batch_size + 1; ++i) { - 
lod[0].push_back(offset); + lod[0][i] = offset; offset += output_height * output_width; } out->set_lod(lod); @@ -89,7 +93,7 @@ class BlockExpandKernel : public framework::OpKernel { }; template -class BlockExpandGradKernel : public framework::OpKernel { +class Im2SequenceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* in = ctx.Input("X"); diff --git a/python/paddle/v2/fluid/tests/test_block_expand_op.py b/python/paddle/v2/fluid/tests/test_im2sequence_op.py similarity index 85% rename from python/paddle/v2/fluid/tests/test_block_expand_op.py rename to python/paddle/v2/fluid/tests/test_im2sequence_op.py index 424bc7dc6e..cd1b2164f0 100644 --- a/python/paddle/v2/fluid/tests/test_block_expand_op.py +++ b/python/paddle/v2/fluid/tests/test_im2sequence_op.py @@ -1,11 +1,24 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
import unittest import numpy as np from op_test import OpTest -def get_output_shape(attrs, x): - img_height = x.shape[2] - img_width = x.shape[3] +def get_output_shape(attrs, in_shape): + img_height = in_shape[2] + img_width = in_shape[3] padding_height = attrs['padding_height'] padding_width = attrs['padding_width'] @@ -73,8 +86,8 @@ def im2col(attrs, im, col): im_row_offset][im_col_offset] -def block_expand(inputs, attrs): - output_height, output_width = get_output_shape(attrs, inputs) +def Im2Sequence(inputs, attrs): + output_height, output_width = get_output_shape(attrs, inputs.shape) img_channels = inputs.shape[1] batch_size = inputs.shape[0] out = np.zeros([ @@ -109,13 +122,12 @@ class TestBlockExpandOp(OpTest): def setUp(self): self.config() - self.op_type = "block_expand" - #x = np.random.uniform(0.1, 1, - x = np.random.randint(0, 10, [ + self.op_type = "im2sequence" + x = np.random.uniform(0.1, 1, [ self.batch_size, self.img_channels, self.img_height, self.img_width ]).astype("float32") - out = block_expand(x, self.attrs) + out = Im2Sequence(x, self.attrs) self.inputs = {'X': x} self.outputs = {'Out': out} From 500e29a4a4a8d6e70f79cc109f5f43709a4ad605 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 22 Jan 2018 13:11:45 +0800 Subject: [PATCH 16/18] 1. Reduce attributes 2. Rename 'get_output_size' to 'OutputSize' 3. Remove redundant whitespace char. 
--- paddle/operators/im2sequence_op.cc | 59 ++++++++++++++---------------- paddle/operators/im2sequence_op.h | 56 +++++++++++----------------- 2 files changed, 49 insertions(+), 66 deletions(-) diff --git a/paddle/operators/im2sequence_op.cc b/paddle/operators/im2sequence_op.cc index 9b2397bdc8..9c9802c043 100644 --- a/paddle/operators/im2sequence_op.cc +++ b/paddle/operators/im2sequence_op.cc @@ -30,28 +30,24 @@ class Im2SequenceOp : public framework::OperatorWithKernel { auto in_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(in_dim.size(), 4, - "Input(X) format must be 4D tensor, eg., NCHW."); + "Input(X) format must be 4D tensor, eg., NCHW."); - int block_height = ctx->Attrs().Get("block_height"); - int block_width = ctx->Attrs().Get("block_width"); - int stride_height = ctx->Attrs().Get("stride_height"); - int stride_width = ctx->Attrs().Get("stride_width"); - int padding_height = ctx->Attrs().Get("padding_height"); - int padding_width = ctx->Attrs().Get("padding_width"); + auto kernels = ctx->Attrs().Get>("kernels"); + auto strides = ctx->Attrs().Get>("strides"); + auto paddings = ctx->Attrs().Get>("paddings"); int batch_size = in_dim[0]; int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - int output_height = get_output_size(img_height, block_height, stride_height, - padding_height); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); int output_width = - get_output_size(img_width, block_width, stride_width, padding_width); + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); ctx->SetOutputDim("Out", {batch_size * output_height * output_width, - img_channels * block_height * block_width}); - // TODO(wanghaoshuang): cal lod in complie time + img_channels * kernels[0] * kernels[1]}); } }; @@ -66,26 +62,30 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { "H: height" "W: width"); AddOutput("Out", "(LodTensor)The output data of im2sequence 
 op,"); - AddAttr("block_height", "(int)height of block."); - AddAttr("block_width", "(int)width of block."); - AddAttr("stride_height", "(int)height of stride."); - AddAttr("stride_width", "(int)width of stride."); - AddAttr("padding_height", "(int)height of padding."); - AddAttr("padding_width", "(int)width of padding."); + AddAttr>("kernels", + "(vector), the " + "kernels(kernel_height, kernel_width)") + AddAttr>("strides", + "(vector default:{1, 1}), the " + "strides(h_stride, w_stride)") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0, 0, 0}), the " + "paddings(up_pad, left_pad, down_pad, right_pad)") + .SetDefault({0, 0, 0, 0}); AddComment(R"DOC( -Convert feature map to minibatch matrix. -- matirx height is: output_height * output_width -- matrix width is: block_height * block_width * channels +This op uses kernels to scan images and converts these images to sequences. +After expanding, The number of time steps are output_height * output_width +and the dimension of each time step is kernel_height * kernel_width * channels, +in which: output_height = - 1 + (2 * padding_height + img_height - block_height + stride_height - 1) / + 1 + (padding_height + padding_down + img_height - kernel_height + stride_height - 1) / stride_height; output_width = - 1 + (2 * padding_width + img_width - block_width + stride_width - 1) / + 1 + (padding_left + padding_right + img_width - kernel_width + stride_width - 1) / stride_width; -After expanding, The number of time steps are output_height * output_width -and the dimension of each time step is block_height * block_width * channels. This op can be used after convolution neural network, and before recurrent neural network. 
Given: @@ -109,12 +109,9 @@ x.dims = {2, 2, 3, 3} And: -block_height = 2 -block_width = 2 -stride_height = 1 -stride_width = 1 -padding_height = 0 -padding_width = 0 +kernels = [2, 2] +strides = [1, 1] +paddings = [0, 0, 0, 0] Then: diff --git a/paddle/operators/im2sequence_op.h b/paddle/operators/im2sequence_op.h index 85d6cac444..352d290b1b 100644 --- a/paddle/operators/im2sequence_op.h +++ b/paddle/operators/im2sequence_op.h @@ -26,9 +26,11 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -inline int get_output_size(int img_size, int block_size, int stride, - int padding) { - return (1 + (img_size + 2 * padding - block_size + stride - 1) / stride); +inline int OutputSize(int input_size, int filter_size, int padding_0, + int padding_1, int stride) { + const int output_size = + (input_size + padding_0 + padding_1 - filter_size) / stride + 1; + return output_size; } template @@ -47,32 +49,24 @@ class Im2SequenceKernel : public framework::OpKernel { int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - int block_height = ctx.Attr("block_height"); - int block_width = ctx.Attr("block_width"); - int stride_height = ctx.Attr("stride_height"); - int stride_width = ctx.Attr("stride_width"); - int padding_height = ctx.Attr("padding_height"); - int padding_width = ctx.Attr("padding_width"); - - int output_height = get_output_size(img_height, block_height, stride_height, - padding_height); + + auto kernels = ctx->Attrs().Get>("kernels"); + auto strides = ctx->Attrs().Get>("strides"); + auto paddings = ctx->Attrs().Get>("paddings"); + int output_height = + OutputSize(img_height, kernels[0], paddings[0], paddings[2] strides[0]); int output_width = - get_output_size(img_width, block_width, stride_width, padding_width); + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); const std::vector dilations({1, 1}); - const std::vector strides( - {stride_height, stride_width, 
stride_height, stride_width}); - const std::vector paddings( - {padding_height, padding_width, padding_height, padding_width}); auto out_dims = out->dims(); out->Resize({batch_size, out->numel() / batch_size}); for (int i = 0; i < batch_size; i++) { const Tensor src = in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize({output_height, output_width, - img_channels, block_height, - block_width}); + Tensor dst = out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, kernels[0], kernels[1]}); math::Im2ColFunctor f; auto& dev_ctx = ctx.template device_context(); @@ -112,22 +106,15 @@ class Im2SequenceGradKernel : public framework::OpKernel { int img_height = in_dim[2]; int img_width = in_dim[3]; - int block_height = ctx.Attr("block_height"); - int block_width = ctx.Attr("block_width"); - int stride_height = ctx.Attr("stride_height"); - int stride_width = ctx.Attr("stride_width"); - int padding_height = ctx.Attr("padding_height"); - int padding_width = ctx.Attr("padding_width"); - int output_height = get_output_size(img_height, block_height, stride_height, - padding_height); + auto kernels = ctx->Attrs().Get>("kernels"); + auto strides = ctx->Attrs().Get>("strides"); + auto paddings = ctx->Attrs().Get>("paddings"); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); int output_width = - get_output_size(img_width, block_width, stride_width, padding_width); + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); const std::vector dilations({1, 1}); - const std::vector strides( - {stride_height, stride_width, stride_height, stride_width}); - const std::vector paddings( - {padding_height, padding_width, padding_height, padding_width}); auto d_out_dims = d_out->dims(); d_out->Resize({batch_size, d_out->numel() / batch_size}); @@ -135,8 +122,7 @@ class Im2SequenceGradKernel : public framework::OpKernel { Tensor dst = d_x->Slice(i, i + 
1).Resize({img_channels, img_height, img_width}); const Tensor src = d_out->Slice(i, i + 1).Resize( - {output_height, output_width, img_channels, block_height, - block_width}); + {output_height, output_width, img_channels, kernels[0], kernels[1]}); math::Col2ImFunctor f; auto& dev_ctx = ctx.template device_context(); f(dev_ctx, src, dilations, strides, paddings, &dst); From 3a48282e61750688c02ab3330b7373b37d81ee74 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 22 Jan 2018 15:07:08 +0800 Subject: [PATCH 17/18] Fix unitest --- paddle/operators/im2sequence_op.cc | 10 +-- paddle/operators/im2sequence_op.h | 16 ++-- .../v2/fluid/tests/test_im2sequence_op.py | 73 +++++++------------ 3 files changed, 38 insertions(+), 61 deletions(-) diff --git a/paddle/operators/im2sequence_op.cc b/paddle/operators/im2sequence_op.cc index 9c9802c043..1854fc384c 100644 --- a/paddle/operators/im2sequence_op.cc +++ b/paddle/operators/im2sequence_op.cc @@ -64,11 +64,11 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(LodTensor)The output data of im2sequence op,"); AddAttr>("kernels", "(vector), the " - "kernels(kernel_height, kernel_width)") - AddAttr>("strides", - "(vector default:{1, 1}), the " - "strides(h_stride, w_stride)") - .SetDefault({1, 1}); + "kernels(kernel_height, kernel_width)"); + AddAttr>("strides", + "(vector default:{1, 1}), the " + "strides(h_stride, w_stride)") + .SetDefault({1, 1}); AddAttr>("paddings", "(vector default:{0, 0, 0, 0}), the " "paddings(up_pad, left_pad, down_pad, right_pad)") diff --git a/paddle/operators/im2sequence_op.h b/paddle/operators/im2sequence_op.h index 352d290b1b..aeb8100151 100644 --- a/paddle/operators/im2sequence_op.h +++ b/paddle/operators/im2sequence_op.h @@ -50,11 +50,11 @@ class Im2SequenceKernel : public framework::OpKernel { int img_height = in_dim[2]; int img_width = in_dim[3]; - auto kernels = ctx->Attrs().Get>("kernels"); - auto strides = ctx->Attrs().Get>("strides"); - auto 
paddings = ctx->Attrs().Get>("paddings"); - int output_height = - OutputSize(img_height, kernels[0], paddings[0], paddings[2] strides[0]); + auto kernels = ctx.Attr>("kernels"); + auto strides = ctx.Attr>("strides"); + auto paddings = ctx.Attr>("paddings"); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); int output_width = OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); @@ -106,9 +106,9 @@ class Im2SequenceGradKernel : public framework::OpKernel { int img_height = in_dim[2]; int img_width = in_dim[3]; - auto kernels = ctx->Attrs().Get>("kernels"); - auto strides = ctx->Attrs().Get>("strides"); - auto paddings = ctx->Attrs().Get>("paddings"); + auto kernels = ctx.Attr>("kernels"); + auto strides = ctx.Attr>("strides"); + auto paddings = ctx.Attr>("paddings"); int output_height = OutputSize(img_height, kernels[0], paddings[0], paddings[2], strides[0]); int output_width = diff --git a/python/paddle/v2/fluid/tests/test_im2sequence_op.py b/python/paddle/v2/fluid/tests/test_im2sequence_op.py index cd1b2164f0..2cab3e31a5 100644 --- a/python/paddle/v2/fluid/tests/test_im2sequence_op.py +++ b/python/paddle/v2/fluid/tests/test_im2sequence_op.py @@ -20,22 +20,19 @@ def get_output_shape(attrs, in_shape): img_height = in_shape[2] img_width = in_shape[3] - padding_height = attrs['padding_height'] - padding_width = attrs['padding_width'] - block_height = attrs['block_height'] - block_width = attrs['block_width'] - stride_height = attrs['stride_height'] - stride_width = attrs['stride_width'] + paddings = attrs['paddings'] + kernels = attrs['kernels'] + strides = attrs['strides'] output_height = \ 1 + \ - (img_height + 2 * padding_height - block_height + stride_height - 1) / \ - stride_height + (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \ + strides[0] output_width = \ 1 + \ - (img_width + 2 * padding_width - block_width + stride_width - 1) / \ - stride_width + (img_width + 
paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \ + strides[1] return output_height, output_width @@ -46,19 +43,11 @@ def im2col(attrs, im, col): col: {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth} """ - input_channels = im.shape[0] - input_height = im.shape[1] - input_width = im.shape[2] + input_channels, input_height, input_width = im.shape + output_height, output_width, _, filter_height, filter_width = col.shape - output_height = col.shape[0] - output_width = col.shape[1] - filter_height = col.shape[3] - filter_width = col.shape[4] - - stride_height = attrs['stride_height'] - stride_width = attrs['stride_width'] - padding_height = attrs['padding_height'] - padding_width = attrs['padding_width'] + stride_height, stride_width = attrs['strides'] + padding_height, padding_width = attrs['paddings'][0:2] for col_row_idx in range(0, output_height): for col_col_idx in range(0, output_width): @@ -92,7 +81,7 @@ def Im2Sequence(inputs, attrs): batch_size = inputs.shape[0] out = np.zeros([ batch_size, output_height, output_width, img_channels, - attrs['block_height'], attrs['block_width'] + attrs['kernels'][0], attrs['kernels'][1] ]).astype("float32") for i in range(len(inputs)): @@ -100,7 +89,7 @@ def Im2Sequence(inputs, attrs): out = out.reshape([ batch_size * output_height * output_width, - img_channels * attrs['block_height'] * attrs['block_width'] + img_channels * attrs['kernels'][0] * attrs['kernels'][1] ]) return out @@ -112,12 +101,9 @@ class TestBlockExpandOp(OpTest): self.img_height = 4 self.img_width = 4 self.attrs = { - 'block_height': 2, - 'block_width': 2, - 'stride_height': 1, - 'stride_width': 1, - 'padding_height': 1, - 'padding_width': 1, + 'kernels': [2, 2], + 'strides': [1, 1], + 'paddings': [1, 1, 1, 1] } def setUp(self): @@ -145,12 +131,9 @@ class TestBlockExpandOpCase2(TestBlockExpandOp): self.img_height = 4 self.img_width = 5 self.attrs = { - 'block_height': 2, - 'block_width': 1, - 'stride_height': 2, - 
'stride_width': 1, - 'padding_height': 2, - 'padding_width': 1, + 'kernels': [2, 1], + 'strides': [2, 1], + 'paddings': [2, 1, 2, 1] } @@ -161,12 +144,9 @@ class TestBlockExpandOpCase3(TestBlockExpandOp): self.img_height = 4 self.img_width = 5 self.attrs = { - 'block_height': 2, - 'block_width': 1, - 'stride_height': 2, - 'stride_width': 1, - 'padding_height': 2, - 'padding_width': 0, + 'kernels': [2, 1], + 'strides': [2, 1], + 'paddings': [2, 0, 2, 0] } @@ -177,12 +157,9 @@ class TestBlockExpandOpCase4(TestBlockExpandOp): self.img_height = 3 self.img_width = 3 self.attrs = { - 'block_height': 2, - 'block_width': 2, - 'stride_height': 1, - 'stride_width': 1, - 'padding_height': 0, - 'padding_width': 0, + 'kernels': [2, 2], + 'strides': [1, 1], + 'paddings': [0, 0, 0, 0] } From c9e208c84593362656663f5e59f787b77ff44875 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 22 Jan 2018 19:19:06 +0800 Subject: [PATCH 18/18] Fix white space in comments. --- paddle/operators/im2sequence_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/operators/im2sequence_op.cc b/paddle/operators/im2sequence_op.cc index 1854fc384c..31baaedf69 100644 --- a/paddle/operators/im2sequence_op.cc +++ b/paddle/operators/im2sequence_op.cc @@ -56,12 +56,12 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { Im2SequenceOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", - "(Tensor)The input tensor has NCHW format." + "(Tensor) The input tensor has NCHW format." "N: batch size" "C: channels" "H: height" "W: width"); - AddOutput("Out", "(LodTensor)The output data of im2sequence op,"); + AddOutput("Out", "(LodTensor) The output data of im2sequence op,"); AddAttr>("kernels", "(vector), the " "kernels(kernel_height, kernel_width)");