From fef617ae072856bae17edd98cbddf88d198c95d0 Mon Sep 17 00:00:00 2001 From: wanghaox Date: Sat, 11 Nov 2017 19:59:20 +0800 Subject: [PATCH 01/36] resolve merge conflicts in paddle/operators/math/CMakeLists.txt --- paddle/operators/math/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index b39a64c0f3..d55aed19cb 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -8,22 +8,22 @@ if(WITH_GPU) nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) - nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) + nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) cc_library(pooling SRCS pooling.cc DEPS device_context) - cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(vol2col SRCS vol2col.cc DEPS device_context) cc_library(context_project SRCS context_project.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) + cc_library(maxouting SRCS maxouting.cc DEPS device_context) endif() cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) From 4748073dc6793539d318fb7bc437c50fc8826373 Mon Sep 17 00:00:00 2001 From: wanghaox Date: Sat, 11 Nov 2017 20:10:54 +0800 Subject: [PATCH 02/36] re-add sequence_pooling to paddle/operators/math/CMakeLists.txt; it may have been dropped while resolving conflicts --- paddle/operators/math/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index d55aed19cb..b330f30d21 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -8,6 +8,7 @@ if(WITH_GPU) nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) + nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) @@ -19,6 +20,7 @@ else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) cc_library(pooling SRCS pooling.cc DEPS device_context) + cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) cc_library(vol2col SRCS vol2col.cc DEPS device_context) cc_library(context_project SRCS
context_project.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) From bc45335e552b90f1119a8eeec33da216f3cfada8 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 21 Nov 2017 14:52:54 +0800 Subject: [PATCH 03/36] add unpool --- paddle/operators/CMakeLists.txt | 2 + paddle/operators/math/CMakeLists.txt | 2 + paddle/operators/math/unpooling.cc | 110 +++++++++++++++++++++ paddle/operators/math/unpooling.cu | 143 +++++++++++++++++++++++++++ paddle/operators/math/unpooling.h | 48 +++++++++ paddle/operators/unpool_op.cc | 116 ++++++++++++++++++++++ paddle/operators/unpool_op.cu.cc | 22 +++++ paddle/operators/unpool_op.h | 85 ++++++++++++++++ 8 files changed, 528 insertions(+) create mode 100644 paddle/operators/math/unpooling.cc create mode 100644 paddle/operators/math/unpooling.cu create mode 100644 paddle/operators/math/unpooling.h create mode 100644 paddle/operators/unpool_op.cc create mode 100644 paddle/operators/unpool_op.cu.cc create mode 100644 paddle/operators/unpool_op.h diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index d39f7bf452..c720cce182 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -139,6 +139,7 @@ set(DEPS_OPS sum_op pool_op maxout_op + unpool_op pool_with_index_op nccl_op sequence_conv_op @@ -151,6 +152,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(sum_op DEPS net_op selected_rows_functor) op_library(pool_op DEPS pooling) op_library(maxout_op DEPS maxouting) +op_library(unpool_op DEPS unpooling) op_library(pool_with_index_op DEPS pooling) op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) if(WITH_GPU) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index b330f30d21..cd7e33cd7c 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -14,6 +14,7 @@ if(WITH_GPU) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) + nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) @@ -26,6 +27,7 @@ else() cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(maxouting SRCS maxouting.cc DEPS device_context) + cc_library(unpooling SRCS unpooling.cc DEPS device_context) endif() cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc new file mode 100644 index 0000000000..36506b903e --- /dev/null +++ b/paddle/operators/math/unpooling.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/maxouting.h" + +namespace paddle { +namespace operators { +namespace math { + +// All tensors are in NCHW format +template +class Unpool2d_Max_Functor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + framework::Tensor * output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const T* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + if(index > output_feasize) { + // throw an exception here! + } + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } + } +}; + + + +template +class Unpool2d_MaxGradFunctor { +public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + framework::Tensor * input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const T* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int f = 0; f < input_feasize; ++f) { + int index = indices_data[i]; + if(index > output_feasize) { + // throw an exception here! + } + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } + } +}; + +template class Unpool2d_MaxGradFunctor; +template class Unpool2d_MaxGradFunctor; +template class Unpool2d_MaxFunctor; +template class Unpool2d_MaxFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle
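Before the GPU version, it is worth spelling out the scatter semantics the CPU functor above implements: within each (batch, channel) feature map, every pooled value is written to the flat offset that max-pooling recorded for it, and positions that receive no write are expected to hold zero (the explicit zero-fill is only added in a later patch of this series). A minimal standalone sketch of that behavior, independent of the Paddle framework and with purely illustrative shapes and values:

#include <cstdio>
#include <vector>

int main() {
  // One 2x2 pooled map scattered into a zero-filled 4x4 unpooled map.
  const int out_h = 4, out_w = 4;
  std::vector<float> in = {9.f, 8.f, 7.f, 6.f};  // pooled maxima
  std::vector<int> idx = {0, 3, 10, 15};         // flat positions h * out_w + w
  std::vector<float> out(out_h * out_w, 0.f);    // untouched entries stay zero
  for (std::size_t i = 0; i < in.size(); ++i) {
    out[idx[i]] = in[i];                         // the scatter step
  }
  for (int h = 0; h < out_h; ++h) {
    for (int w = 0; w < out_w; ++w) {
      std::printf("%4.0f", out[h * out_w + w]);
    }
    std::printf("\n");
  }
  return 0;
}

This prints 9, 8, 7 and 6 at flat offsets 0, 3, 10 and 15 with zeros elsewhere, which is exactly what the nested b/c/i loops above do once per batch element and channel.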
diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu new file mode 100644 index 0000000000..53e88a57c1 --- /dev/null +++ b/paddle/operators/math/unpooling.cu @@ -0,0 +1,143 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/maxouting.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void KernelUnpool2dMax(const int nthreads, + const T* input_data, + const T* indices_data, + const int input_height, + const int input_width, + T* output_data, + const int output_height, + const int output_width) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int out_offset = i / (input_height * input_width) \ + * output_height * output_width; + int out_index = indices_data[i]; + output_data[out_offset + out_index] = input_data[i]; + } +} +template +__global__ void KernelUnpool2dMaxGrad(const int nthreads, + const T* input_data, + const int input_height, + const int input_width, + const T* output_data, + const T* output_grad, + const int output_height, + const int output_width, + T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int out_offset = i / (input_height * input_width) \ + * output_height * output_width; + int out_index = indices_data[i]; + input_grad[i] = output_grad[out_offset + out_index]; + } +} +/* + * All tensors are in NCHW format. + */ +template +class Unpool2d_MaxFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + framework::Tensor * output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const T* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelUnpool2dMax< + T><<<grid, threads, 0, + reinterpret_cast<const platform::CUDADeviceContext&>(context) + .stream()>>>(nthreads, input_data, indices_data, + input_height, input_width, + output_data, output_height, output_width); + } +};
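The launch configuration used by these functors follows the standard grid-stride pattern: blocks = (nthreads + 1024 - 1) / 1024 is a ceiling division, each thread starts at its global id blockIdx.x * blockDim.x + threadIdx.x, and strides by blockDim.x * gridDim.x until it passes nthreads. A small host-side sketch (plain C++, hypothetical element count) verifies that this visits every element exactly once even when nthreads is not a multiple of the block size:

#include <cassert>
#include <vector>

int main() {
  const int nthreads = 2500;                          // e.g. output->numel()
  const int block = 1024;                             // threads per block, as above
  const int blocks = (nthreads + block - 1) / block;  // ceiling division -> 3
  std::vector<int> visits(nthreads, 0);
  // Simulate the grid-stride loop every CUDA thread would execute.
  for (int b = 0; b < blocks; ++b) {
    for (int t = 0; t < block; ++t) {
      for (int i = b * block + t; i < nthreads; i += block * blocks) {
        ++visits[i];
      }
    }
  }
  for (int v : visits) assert(v == 1);                // each element handled once
  return 0;
}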
+/* + * All tensors are in NCHW format. + */ +template +class Unpool2d_MaxGradFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + framework::Tensor * input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad, + int groups) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + + const T* input_data = input.data(); + const T* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelUnpool2dMaxGrad< + T><<<grid, threads, 0, + reinterpret_cast<const platform::CUDADeviceContext&>(context) + .stream()>>>( + nthreads, input_data, indices_data, + input_height, input_width, + output_data, output_grad_data, + output_height, output_width, + input_grad_data); + } +}; + +template class Unpool2d_MaxGradFunctor; +template class Unpool2d_MaxGradFunctor; + +template class Unpool2d_MaxFunctor; +template class Unpool2d_MaxFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h new file mode 100644 index 0000000000..bb0e0d08f0 --- /dev/null +++ b/paddle/operators/math/unpooling.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX \ + __FLT_MAX__ + +template + +class Unpool2d_Max_Functor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + framework::Tensor * output); +}; + +template +class Unpool2d_Max_GradFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + framework::Tensor * input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc new file mode 100644 index 0000000000..d81428e802 --- /dev/null +++ b/paddle/operators/unpool_op.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/operators/unpool_op.h" +namespace paddle { +namespace operators { + +using framework::Tensor; + +class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + UnpoolOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor of unpool operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddInput("Y", + "(Tensor) The input tensor of the indices given out by MaxPool2d. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of unpool operator." + "The format of output tensor is also NCHW." + "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); + AddAttr>("ksize", + "(vector ), the unpooling window size(height, width) " + "of unpooling operator."); + AddAttr>("strides", "(vector, default:{1, 1}), " + "strides(height, width) of unpooling operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", "(vector defalut:{0,0}), " + "paddings(height, width) of unpooling operator.") + .SetDefault({0, 0}); + AddAttr("unpoolingType", + "(string), unpooling type, can be \"max\" for max-unpooling " + "and \"avg\" for average-unpooling.") + .InEnum({"max", "avg"}); + AddComment(R"DOC( + + )DOC"); + } +}; + +int OutputSize(int input_size, int ksize, int padding, int stride) { + int output_size = (input_size -1) * stride - 2 * padding + ksize; + return output_size; +} + +class UnpoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UnpoolOp should not be null."); + + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Y"); + std::string unpooling_type = ctx->Attrs().Get("unpooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + + PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, + "Unpooling intput should be 4-D or 5-D tensor."); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class UnpoolOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) 
must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(unpool2d, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool2d_grad, + ops::UnpoolOpGrad); +REGISTER_OP_CPU_KERNEL(unpool2d, ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL(unpool2d_grad, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc new file mode 100644 index 0000000000..8aeef8b3cf --- /dev/null +++ b/paddle/operators/unpool_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/unpool_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(unpool2d, + ops::UnpoolKernel); +REGISTER_OP_GPU_KERNEL(unpool2d_grad, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h new file mode 100644 index 0000000000..38903dee17 --- /dev/null +++ b/paddle/operators/unpool_op.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/unpooling.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class UnpoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + const Tensor* in_y = context.Input("Y"); + Tensor* out = context.Output("Out"); + std::string pooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + switch (ksize.size()) { + case 2: { + if (pooling_type == "max") { + math::Unpool2d_Max_Functor unpool2d_max_forward; + unpool2d_max_forward(context.device_context(), *in_x, *in_y, + ksize, strides, paddings, out); + } + } break; + default: { PADDLE_THROW("Pool op only supports 2D input."); } + } + } +}; + +template +class UnpoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + const Tensor* in_y = context.Input("Y"); + const Tensor* out = context.Input("Out"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + std::string pooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + auto& device_ctx = context.device_context(); + math::SetConstant zero; + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0.0)); + } + switch (ksize.size()) { + case 2: { + if (pooling_type == "max") { + math::UnpoolGradFunctor maxout_backward; + maxout_backward(context.device_context(), *in_x, *in_y, in_x_grad, *out, + *out_grad, ksize, strides, paddings); + } + } break; + default: { PADDLE_THROW("Pool op only supports 2D input."); } + } + } +}; + +} // namespace operators +} // namespace paddle From 45a8c9ddaf5d16fdeeb6a424988d23c121d207b4 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 21 Nov 2017 16:28:51 +0800 Subject: [PATCH 04/36] add unpool2d make ok --- paddle/operators/CMakeLists.txt | 7 +++++++ paddle/operators/math/unpooling.cc | 26 ++++++++++---------------- paddle/operators/math/unpooling.cu | 21 ++++++++++++--------- paddle/operators/math/unpooling.h | 5 +++-- paddle/operators/unpool_op.cc | 25 ++++++++++++++++--------- paddle/operators/unpool_op.cu.cc | 7 +++++-- paddle/operators/unpool_op.h | 13 ++++++------- 7 files changed, 59 insertions(+), 45 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index ee25abd6cb..d53bca277d 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -80,6 +80,13 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(pool2d);\n") endif() + # unpool_op contains several operators + if ("${TARGET}" STREQUAL "unpool_op") + set(pybind_flag 1) + # It's enough to just adding one operator to pybind + file(APPEND ${pybind_file} "USE_OP(unpool2d);\n") + endif() + # pool_cudnn_op contains several operators if ("${TARGET}" STREQUAL "pool_cudnn_op") set(pybind_flag 1) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 
36506b903e..8cfdb4bb60 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/math/maxouting.h" +#include "paddle/operators/math/unpooling.h" namespace paddle { namespace operators { @@ -20,7 +20,7 @@ namespace math { // All tensors are in NCHW format template -class Unpool2d_Max_Functor { +class Unpool2d_MaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -36,16 +36,14 @@ class Unpool2d_Max_Functor { int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; const T* input_data = input.data(); - const T* indices_data = indices.data(); + const int * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - if(index > output_feasize) { - // throw an exception here! - } + // PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); output_data[index] = input_data[i]; } input_data += input_feasize; @@ -70,26 +68,22 @@ public: const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; - const int output_channels = output->dims()[1]; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; - const T* input_data = input.data(); - const T* indices_data = indices.data(); - const T* output_data = output.data(); + const int* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { - for (int f = 0; f < input_feasize; ++f) { + for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - if(index > output_feasize) { - // throw an exception here! - } + // PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); input_grad_data[i] = output_grad_data[index]; } input_grad_data += input_feasize; diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 53e88a57c1..c8e7b25234 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#include "paddle/operators/math/maxouting.h" +#include "paddle/operators/math/unpooling.h" #include "paddle/platform/cuda_helper.h" namespace paddle { @@ -22,7 +22,7 @@ namespace math { template __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const T* indices_data, + const int* indices_data, const int input_height, const int input_width, T* output_data, @@ -30,16 +30,19 @@ __global__ void KernelUnpool2dMax(const int nthreads, const int output_width) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; + // int output_feasize = output_height * output_width; for (int i = index; i < nthreads; i += offset) { int out_offset = i / (input_height * input_width) \ * output_height * output_width; int out_index = indices_data[i]; + // PADDLE_ENFORCE(out_index < output_feasize, "err index in unpooling!"); output_data[out_offset + out_index] = input_data[i]; } } template __global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, + const int* indices_data, const int input_height, const int input_width, const T* output_data, @@ -49,10 +52,13 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, T* input_grad) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; + // int output_feasize = output_height * output_width; for (int i = index; i < nthreads; i += offset) { int out_offset = i / (input_height * input_width) \ * output_height * output_width; int out_index = indices_data[i]; + // PADDLE_ENFORCE(out_index < output_feasize, + // "err index in unpooling!"); input_grad[i] = output_grad[out_offset + out_index]; } } @@ -72,10 +78,8 @@ class Unpool2d_MaxFunctor { const int output_channels = output->dims()[1]; const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; - int input_feasize = input_height * input_width; - int output_feasize = output_height * output_width; const T* input_data = input.data(); - const T* indices_data = indices.data(); + const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int nthreads = output->numel(); @@ -99,19 +103,18 @@ class Unpool2d_MaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor * input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad, - int groups) { + const framework::Tensor& output_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; const int output_channels = output.dims()[1]; const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; - const T* input_data = input.data(); - const T* indices_data = indices.data(); + const int* indices_data = indices.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index bb0e0d08f0..ba4be89746 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -26,7 +26,7 @@ namespace math { template -class Unpool2d_Max_Functor { +class Unpool2d_MaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -35,10 +35,11 @@ class Unpool2d_Max_Functor { }; template -class Unpool2d_Max_GradFunctor { +class 
Unpool2d_MaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor * input_grad, const framework::Tensor& output, const framework::Tensor& output_grad); diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index d81428e802..9d6e69dffb 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -20,7 +20,8 @@ using framework::Tensor; class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { public: - UnpoolOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + Unpool2dOpMaker(framework::OpProto* proto, \ + framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor of unpool operator. " @@ -39,10 +40,12 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>("ksize", "(vector ), the unpooling window size(height, width) " "of unpooling operator."); - AddAttr>("strides", "(vector, default:{1, 1}), " + AddAttr>("strides", + "(vector, default:{1, 1}), " "strides(height, width) of unpooling operator.") .SetDefault({1, 1}); - AddAttr>("paddings", "(vector defalut:{0,0}), " + AddAttr>("paddings", + "(vector defalut:{0,0}), " "paddings(height, width) of unpooling operator.") .SetDefault({0, 0}); AddAttr("unpoolingType", @@ -73,7 +76,8 @@ class UnpoolOp : public framework::OperatorWithKernel { auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Y"); - std::string unpooling_type = ctx->Attrs().Get("unpooling_type"); + std::string unpooling_type = \ + ctx->Attrs().Get("unpooling_type"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); @@ -95,7 +99,7 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), @@ -109,8 +113,11 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(unpool2d, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool2d_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL(unpool2d, ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL(unpool2d, + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL(unpool2d_grad, - ops::UnpoolGradKernel); + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 8aeef8b3cf..96fb9e40c3 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -16,7 +16,10 @@ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(unpool2d, - ops::UnpoolKernel); + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_GPU_KERNEL(unpool2d_grad, ops::UnpoolGradKernel); + float>, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 38903dee17..47dd8da6f7 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -37,9 +37,8 @@ class UnpoolKernel : public 
framework::OpKernel { switch (ksize.size()) { case 2: { if (pooling_type == "max") { - math::Unpool2d_Max_Functor unpool2d_max_forward; - unpool2d_max_forward(context.device_context(), *in_x, *in_y, - ksize, strides, paddings, out); + math::Unpool2d_MaxFunctor unpool2d_max_forward; + unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } } break; default: { PADDLE_THROW("Pool op only supports 2D input."); } @@ -71,12 +70,12 @@ class UnpoolGradKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { if (pooling_type == "max") { - math::UnpoolGradFunctor maxout_backward; - maxout_backward(context.device_context(), *in_x, *in_y, in_x_grad, *out, - *out_grad, ksize, strides, paddings); + math::Unpool2d_MaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, + *out, *out_grad); } } break; - default: { PADDLE_THROW("Pool op only supports 2D input."); } + default: { PADDLE_THROW("Unpool op only supports 2D input."); } } } }; From 200f07c2197bb3e35cfcbfcc7dbb201d241a4069 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 21 Nov 2017 17:44:44 +0800 Subject: [PATCH 05/36] add test --- paddle/operators/math/unpooling.cc | 16 +++---- paddle/operators/math/unpooling.cu | 19 ++++---- paddle/operators/math/unpooling.h | 4 +- paddle/operators/unpool_op.cc | 21 ++++++--- paddle/operators/unpool_op.h | 4 +- .../paddle/v2/fluid/tests/test_unpool2d_op.py | 47 +++++++++++++++++++ 6 files changed, 82 insertions(+), 29 deletions(-) create mode 100644 python/paddle/v2/fluid/tests/test_unpool2d_op.py diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 8cfdb4bb60..a1747e76e7 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -20,7 +20,7 @@ namespace math { // All tensors are in NCHW format template -class Unpool2d_MaxFunctor { +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -43,7 +43,7 @@ class Unpool2d_MaxFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - // PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); output_data[index] = input_data[i]; } input_data += input_feasize; @@ -57,7 +57,7 @@ class Unpool2d_MaxFunctor { template -class Unpool2d_MaxGradFunctor { +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -83,7 +83,7 @@ public: for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - // PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); input_grad_data[i] = output_grad_data[index]; } input_grad_data += input_feasize; @@ -94,10 +94,10 @@ public: } }; -template class Unpool2d_MaxGradFunctor; -template class Unpool2d_MaxGradFunctor; -template class Unpool2d_MaxFunctor; -template class Unpool2d_MaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index c8e7b25234..f14dd0626f 100644 --- a/paddle/operators/math/unpooling.cu +++ 
b/paddle/operators/math/unpooling.cu @@ -30,12 +30,11 @@ __global__ void KernelUnpool2dMax(const int nthreads, const int output_width) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; - // int output_feasize = output_height * output_width; for (int i = index; i < nthreads; i += offset) { int out_offset = i / (input_height * input_width) \ * output_height * output_width; int out_index = indices_data[i]; - // PADDLE_ENFORCE(out_index < output_feasize, "err index in unpooling!"); + PADDLE_ASSERT(out_index < (output_height * output_width)); output_data[out_offset + out_index] = input_data[i]; } } @@ -52,13 +51,11 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, T* input_grad) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; - // int output_feasize = output_height * output_width; for (int i = index; i < nthreads; i += offset) { int out_offset = i / (input_height * input_width) \ * output_height * output_width; int out_index = indices_data[i]; - // PADDLE_ENFORCE(out_index < output_feasize, - // "err index in unpooling!"); + PADDLE_ASSERT(out_index < (output_height * output_width)); input_grad[i] = output_grad[out_offset + out_index]; } } @@ -66,7 +63,7 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, * All tensors are in NCHW format. */ template -class Unpool2d_MaxFunctor { +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -99,7 +96,7 @@ class Unpool2d_MaxFunctor { * All tensors are in NCHW format. */ template -class Unpool2d_MaxGradFunctor { +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -135,11 +132,11 @@ class Unpool2d_MaxGradFunctor { } }; -template class Unpool2d_MaxGradFunctor; -template class Unpool2d_MaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; -template class Unpool2d_MaxFunctor; -template class Unpool2d_MaxFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index ba4be89746..93a77bf53e 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -26,7 +26,7 @@ namespace math { template -class Unpool2d_MaxFunctor { +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -35,7 +35,7 @@ class Unpool2d_MaxFunctor { }; template -class Unpool2d_MaxGradFunctor { +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 9d6e69dffb..d450d9f62a 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -49,11 +49,15 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "paddings(height, width) of unpooling operator.") .SetDefault({0, 0}); AddAttr("unpoolingType", - "(string), unpooling type, can be \"max\" for max-unpooling " - "and \"avg\" for average-unpooling.") - .InEnum({"max", "avg"}); + "(string), unpooling type, can be \"max\" for max-unpooling ") + .InEnum({"max"}); AddComment(R"DOC( - + "input: the input Tensor to invert" + "indices: the indices given out by MaxPool2d" + "ksize – Size of the max pooling window." 
+ "stride – Stride of the max pooling window." + "It is set to kernel_size by default." + "padding – Padding that was added to the input" )DOC"); } }; @@ -82,8 +86,13 @@ class UnpoolOp : public framework::OperatorWithKernel { std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, - "Unpooling intput should be 4-D or 5-D tensor."); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Unpooling intput should be 4-D."); + for (int i = 0; i < 4; ++i) { + PADDLE_ENFORCE(in_x_dims[i] == in_y_dims[i], + "X size must be eq Y size!"); + } + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); for (size_t i = 0; i < ksize.size(); ++i) { diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 47dd8da6f7..44115b0726 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -37,7 +37,7 @@ class UnpoolKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { if (pooling_type == "max") { - math::Unpool2d_MaxFunctor unpool2d_max_forward; + math::Unpool2dMaxFunctor unpool2d_max_forward; unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } } break; @@ -70,7 +70,7 @@ class UnpoolGradKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { if (pooling_type == "max") { - math::Unpool2d_MaxGradFunctor unpool2d_max_backward; + math::Unpool2dMaxGradFunctor unpool2d_max_backward; unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, *out, *out_grad); } diff --git a/python/paddle/v2/fluid/tests/test_unpool2d_op.py b/python/paddle/v2/fluid/tests/test_unpool2d_op.py new file mode 100644 index 0000000000..08f734a264 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_unpool2d_op.py @@ -0,0 +1,47 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def maxout_forward_naive(input, groups): + s0, s1, s2, s3 = input.shape + return np.ndarray([s0, s1 / groups, groups, s2, s3], \ + buffer = input, dtype=input.dtype).max(axis=(2)) + + +class TestUnpool2dOp(OpTest): + def setUp(self): + self.op_type = "unpool2d" + self.init_test_case() + input = np.random.random(self.shape).astype("float32") + output = self.MaxOut_forward_naive(input, self.groups).astype("float32") + + self.inputs = {'X': input} + self.attrs = { + 'strides': self.strides, + 'paddings': self.paddings, + 'ksize': self.ksize, + 'unpooling_type': self.pool_type, + } + + self.outputs = {'Out': output.astype('float32')} + + def init_pool_type(self): + self.pool_type = "max" + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + def init_test_case(self): + self.MaxOut_forward_naive = maxout_forward_naive + self.shape = [100, 6, 2, 2] + self.groups=2 + + + + +if __name__ == '__main__': + unittest.main() From 90f664d0b0eb4cb0f13a5ac5c434ed9cb6544687 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 22 Nov 2017 12:52:43 +0800 Subject: [PATCH 06/36] test unpool ok cpu --- paddle/operators/CMakeLists.txt | 7 -- paddle/operators/math/unpooling.cc | 9 +-- paddle/operators/math/unpooling.cu | 4 +- paddle/operators/unpool_op.cc | 25 +++---- paddle/operators/unpool_op.cu.cc | 4 +- paddle/operators/unpool_op.h | 8 +- .../paddle/v2/fluid/tests/test_unpool2d_op.py | 47 ------------ .../paddle/v2/fluid/tests/test_unpool_op.py | 74 +++++++++++++++++++ 8 files changed, 98 insertions(+), 80 deletions(-) delete mode 100644 python/paddle/v2/fluid/tests/test_unpool2d_op.py create 
mode 100644 python/paddle/v2/fluid/tests/test_unpool_op.py diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index d53bca277d..ee25abd6cb 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -80,13 +80,6 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(pool2d);\n") endif() - # unpool_op contains several operators - if ("${TARGET}" STREQUAL "unpool_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(unpool2d);\n") - endif() - # pool_cudnn_op contains several operators if ("${TARGET}" STREQUAL "pool_cudnn_op") set(pybind_flag 1) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index a1747e76e7..0becab721e 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -32,13 +32,13 @@ class Unpool2dMaxFunctor { const int output_channels = output->dims()[1]; const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; - int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; const T* input_data = input.data(); - const int * indices_data = indices.data(); + const T * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); - + memset(output_data, 0, \ + sizeof(T) * output_feasize * output_channels * batch_size); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { @@ -74,9 +74,8 @@ public: int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; - const int* indices_data = indices.data(); + const T* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index f14dd0626f..cd313770ab 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -76,7 +76,7 @@ class Unpool2dMaxFunctor { const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; const T* input_data = input.data(); - const int* indices_data = indices.data(); + const T* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int nthreads = output->numel(); @@ -111,7 +111,7 @@ class Unpool2dMaxGradFunctor { const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; const T* input_data = input.data(); - const int* indices_data = indices.data(); + const T* indices_data = indices.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index d450d9f62a..9036005a4d 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -48,7 +48,7 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(vector defalut:{0,0}), " "paddings(height, width) of unpooling operator.") .SetDefault({0, 0}); - AddAttr("unpoolingType", + AddAttr("unpoolingtype", "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( @@ -80,8 +80,8 @@ class UnpoolOp : public framework::OperatorWithKernel { auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = 
ctx->GetInputDim("Y"); - std::string unpooling_type = \ - ctx->Attrs().Get("unpooling_type"); + std::string unpoolingtype = \ + ctx->Attrs().Get("unpoolingtype"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); @@ -108,9 +108,9 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); + // PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); + // PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + // "Input(Out@GRAD) should not be null"); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Input(X@GRAD) should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); @@ -120,13 +120,12 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(unpool2d, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool2d_grad, +REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL(unpool2d, +REGISTER_OP_CPU_KERNEL(unpool, ops::UnpoolKernel, ops::UnpoolKernel); -REGISTER_OP_CPU_KERNEL(unpool2d_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_CPU_KERNEL(unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); + diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 96fb9e40c3..4949fc467e 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -15,10 +15,10 @@ #include "paddle/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(unpool2d, +REGISTER_OP_GPU_KERNEL(unpool, ops::UnpoolKernel, ops::UnpoolKernel); -REGISTER_OP_GPU_KERNEL(unpool2d_grad, +REGISTER_OP_GPU_KERNEL(unpool_grad, ops::UnpoolGradKernel, ops::UnpoolGradKernel { const Tensor* in_x = context.Input("X"); const Tensor* in_y = context.Input("Y"); Tensor* out = context.Output("Out"); - std::string pooling_type = context.Attr("unpooling_type"); + std::string unpoolingtype = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); switch (ksize.size()) { case 2: { - if (pooling_type == "max") { + if (unpoolingtype == "max") { math::Unpool2dMaxFunctor unpool2d_max_forward; unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } @@ -56,7 +56,7 @@ class UnpoolGradKernel : public framework::OpKernel { const Tensor* out_grad = context.Input(framework::GradVarName("Out")); Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - std::string pooling_type = context.Attr("unpooling_type"); + std::string unpoolingtype = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); @@ -69,7 +69,7 @@ class UnpoolGradKernel : public framework::OpKernel { } switch (ksize.size()) { case 2: { - if (pooling_type == "max") { + if (unpoolingtype == "max") { math::Unpool2dMaxGradFunctor unpool2d_max_backward; 
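// Note on the backward pass: it is the exact mirror of the forward scatter.
// The forward functor wrote output[indices[i]] = input[i] for every pooled
// element i, so the gradient is gathered back along the same mapping,
// input_grad[i] = output_grad[indices[i]]. Every other output position was
// a constant zero in the forward pass and carries no gradient, which is why
// UnpoolGradKernel only zero-fills in_x_grad and then lets the functor copy
// one value per pooled element.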
unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, *out, *out_grad); diff --git a/python/paddle/v2/fluid/tests/test_unpool2d_op.py b/python/paddle/v2/fluid/tests/test_unpool2d_op.py deleted file mode 100644 index 08f734a264..0000000000 --- a/python/paddle/v2/fluid/tests/test_unpool2d_op.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest -import numpy as np -from op_test import OpTest - - -def maxout_forward_naive(input, groups): - s0, s1, s2, s3 = input.shape - return np.ndarray([s0, s1 / groups, groups, s2, s3], \ - buffer = input, dtype=input.dtype).max(axis=(2)) - - -class TestUnpool2dOp(OpTest): - def setUp(self): - self.op_type = "unpool2d" - self.init_test_case() - input = np.random.random(self.shape).astype("float32") - output = self.MaxOut_forward_naive(input, self.groups).astype("float32") - - self.inputs = {'X': input} - self.attrs = { - 'strides': self.strides, - 'paddings': self.paddings, - 'ksize': self.ksize, - 'unpooling_type': self.pool_type, - } - - self.outputs = {'Out': output.astype('float32')} - - def init_pool_type(self): - self.pool_type = "max" - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - def init_test_case(self): - self.MaxOut_forward_naive = maxout_forward_naive - self.shape = [100, 6, 2, 2] - self.groups=2 - - - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py new file mode 100644 index 0000000000..566da6e26e --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -0,0 +1,74 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings): + s0, s1, s2, s3 = input.shape + out_H=(s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0] + out_W=(s3 - 1) * strides[1] - 2 * paddings[1] + ksize[1] + out = np.zeros((s0, s1, out_H, out_W)) + for nidx in xrange(s0): + for cidx in xrange(s1): + for h in xrange(s2): + for w in xrange(s3): + index = indices[nidx, cidx, h, w] + hidx = (index - index % out_W) / out_W + widx = index % out_W + out[nidx, cidx, int(hidx), int(widx)] = input[nidx, cidx, h, w] + + return out + + +class TestUnpoolOp(OpTest): + def setUp(self): + self.op_type = "unpool" + self.init_test_case() + pre_input = np.random.random(self.shape).astype("float32") + N, C, H, W = pre_input.shape + H_out = (H - self.ksize[0] + 2 * self.paddings[0]) / self.strides[0] + 1 + W_out = (W - self.ksize[1] + 2 * self.paddings[1]) / self.strides[1] + 1 + input = np.zeros((N, C, H_out, W_out)) + indices = np.zeros((N, C, H_out, W_out)) + for i in xrange(H_out): + for j in xrange(W_out): + r_start = np.max((i * self.strides[0] - self.paddings[0], 0)) + r_end = np.min((i * self.strides[0] + self.ksize[0] - self.paddings[0], H)) + c_start = np.max((j * self.strides[1] - self.paddings[1], 0)) + c_end = np.min((j * self.strides[1] + self.ksize[1] - self.paddings[1], W)) + for nidx in xrange(N): + for cidx in xrange(C): + x_masked = pre_input[nidx, cidx, r_start:r_end, c_start:c_end] + input[nidx, cidx, i, j] = x_masked.max() + arg = x_masked.argmax() + indices[nidx, cidx, i, j] = (r_start + arg / self.ksize[1]) * W + c_start + arg % self.ksize[1] + output = self.Unpool2d_forward_naive(input, indices, self.ksize, self.strides, self.paddings).astype("float32") + self.inputs = {'X': input.astype('float32'), + 'Y': indices.astype('int16')} + self.attrs = { + 'strides': self.strides, + 'paddings': self.paddings, + 'ksize': self.ksize, + 'unpoolingtype': self.unpoolingtype, + } + self.outputs = {'Out': output.astype('float32')} + + def test_check_output(self): + print self.outputs['Out'] + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.5) + + def init_test_case(self): + self.Unpool2d_forward_naive = unpool2dmax_forward_naive + self.unpoolingtype = "max" + self.shape = [10, 2, 5, 5] + self.ksize = [3, 3] + self.strides = [2, 2] + self.paddings = [0, 0] + + + +if __name__ == '__main__': + unittest.main()
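One convention shared by the C++ functors and the naive Python reference above deserves emphasis: each saved index is a flat offset inside a single output feature map, index = h * out_W + w, counted separately for every sample and channel. A tiny standalone check (plain C++, hypothetical map size) of the round trip that unpool2dmax_forward_naive performs when it recovers hidx and widx:

#include <cassert>

int main() {
  const int out_w = 9;  // hypothetical unpooled map width
  for (int h = 0; h < 7; ++h) {
    for (int w = 0; w < out_w; ++w) {
      const int index = h * out_w + w;  // flatten (h, w)
      assert(index / out_w == h);       // hidx = (index - index % out_W) / out_W
      assert(index % out_w == w);       // widx = index % out_W
    }
  }
  return 0;
}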
'paddings': self.paddings, + 'ksize': self.ksize, + 'unpoolingtype': self.unpoolingtype, + } + self.outputs = {'Out': output.astype('float32')} + + def test_check_output(self): + print self.outputs['Out'] + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.5) + + def init_test_case(self): + self.Unpool2d_forward_naive = unpool2dmax_forward_naive + self.unpoolingtype = "max" + self.shape = [10, 2, 5, 5] + self.ksize = [3, 3] + self.strides = [2, 2] + self.paddings = [0, 0] + + + +if __name__ == '__main__': + unittest.main() From e2a5905eaec4bafa2d469c94f9da5c01f9aae328 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 22 Nov 2017 15:38:17 +0800 Subject: [PATCH 07/36] gpu test ok unpool2dmax --- paddle/operators/math/unpooling.cc | 2 - paddle/operators/math/unpooling.cu | 42 ++++++++++++------- paddle/operators/math/unpooling.h | 3 -- paddle/operators/unpool_op.cc | 3 -- paddle/operators/unpool_op.h | 9 +++- .../paddle/v2/fluid/tests/test_unpool_op.py | 4 +- 6 files changed, 38 insertions(+), 25 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 0becab721e..1622dcca87 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -37,8 +37,6 @@ class Unpool2dMaxFunctor { const T* input_data = input.data(); const T * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); - memset(output_data, 0, \ - sizeof(T) * output_feasize * output_channels * batch_size); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index cd313770ab..d26ceed6ad 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -22,41 +22,56 @@ namespace math { template __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const int* indices_data, + const T* indices_data, const int input_height, const int input_width, + const int channels, T* output_data, const int output_height, const int output_width) { + int bsize = input_height * input_width * channels; + int csize = input_height * input_width; + int out_bsize = output_height * output_width * channels; + int out_csize = output_height * output_width; int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; for (int i = index; i < nthreads; i += offset) { - int out_offset = i / (input_height * input_width) \ - * output_height * output_width; + int bidx = i / bsize; + int boffset = i % bsize; + int cidx = boffset / csize; + int out_offset = bidx * out_bsize + cidx * out_csize; int out_index = indices_data[i]; PADDLE_ASSERT(out_index < (output_height * output_width)); + printf("-------%d------[%f]\n", out_offset + out_index, input_data[i]); output_data[out_offset + out_index] = input_data[i]; } } template __global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, - const int* indices_data, + const T* indices_data, const int input_height, const int input_width, + const int channels, const T* output_data, const T* output_grad, const int output_height, const int output_width, T* input_grad) { + int bsize = input_height * input_width * channels; + int csize = input_height * input_width; + int out_bsize = output_height * output_width * channels; + int out_csize = output_height * output_width; int index = blockIdx.x * blockDim.x + threadIdx.x; int 
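// Grid-stride loop: the thread at `index` handles elements index,
// index + offset, index + 2 * offset, ..., so the kernel covers all
// nthreads elements under any launch configuration, not just the
// one-thread-per-element case.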
offset = blockDim.x * gridDim.x; for (int i = index; i < nthreads; i += offset) { - int out_offset = i / (input_height * input_width) \ - * output_height * output_width; - int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < (output_height * output_width)); - input_grad[i] = output_grad[out_offset + out_index]; + int bidx = i / bsize; + int boffset = i % bsize; + int cidx = boffset / csize; + int out_offset = bidx * out_bsize + cidx * out_csize; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < (output_height * output_width)); + input_grad[i] = output_grad[out_offset + out_index]; } } /* @@ -78,8 +93,7 @@ class Unpool2dMaxFunctor { const T* input_data = input.data(); const T* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); - - int nthreads = output->numel(); + int nthreads = batch_size * output_channels * input_height * input_width; int blocks = (nthreads + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); @@ -88,7 +102,7 @@ class Unpool2dMaxFunctor { T><<(context) .stream()>>>(nthreads, input_data, indices_data, - input_height, input_width, + input_height, input_width, output_channels, output_data, output_height, output_width); } }; @@ -115,7 +129,7 @@ class Unpool2dMaxGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = output.numel(); + int nthreads = batch_size * output_channels * input_height * input_width; int blocks = (nthreads + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); @@ -125,7 +139,7 @@ class Unpool2dMaxGradFunctor { reinterpret_cast(context) .stream()>>>( nthreads, input_data, indices_data, - input_height, input_width, + input_height, input_width, output_channels, output_data, output_grad_data, output_height, output_width, input_grad_data); diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 93a77bf53e..88e88ba117 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -21,9 +21,6 @@ namespace paddle { namespace operators { namespace math { -#define FLT_MAX \ - __FLT_MAX__ - template class Unpool2dMaxFunctor { diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 9036005a4d..add8f15736 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -108,9 +108,6 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); - // PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - // "Input(Out@GRAD) should not be null"); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Input(X@GRAD) should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 452a328eee..e3a45ff9a7 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -29,11 +29,16 @@ class UnpoolKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { const Tensor* in_x = context.Input("X"); const Tensor* in_y = context.Input("Y"); - Tensor* out = context.Output("Out"); + auto * out = context.Output("Out"); std::string 
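// Max-unpool writes only one cell per pooling window, so the output has
// to be zero-filled before the scatter; that is what the SetConstant
// call added just below provides.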
unpoolingtype = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + T* output_data = out->mutable_data(context.GetPlace()); + if (output_data) { + math::SetConstant set_zero; + set_zero(context.device_context(), out, static_cast(0)); + } switch (ksize.size()) { case 2: { if (unpoolingtype == "max") { @@ -66,7 +71,7 @@ class UnpoolGradKernel : public framework::OpKernel { if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0.0)); - } + } switch (ksize.size()) { case 2: { if (unpoolingtype == "max") { diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 566da6e26e..7984743e6f 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -54,6 +54,8 @@ class TestUnpoolOp(OpTest): self.outputs = {'Out': output.astype('float32')} def test_check_output(self): + print self.inputs['X'] + print self.inputs['Y'] print self.outputs['Out'] self.check_output() @@ -63,7 +65,7 @@ class TestUnpoolOp(OpTest): def init_test_case(self): self.Unpool2d_forward_naive = unpool2dmax_forward_naive self.unpoolingtype = "max" - self.shape = [10, 2, 5, 5] + self.shape = [6, 4, 5, 5] self.ksize = [3, 3] self.strides = [2, 2] self.paddings = [0, 0] From 47bd0bb6787d049f094f2c883f54b7d314eedec1 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 22 Nov 2017 15:45:43 +0800 Subject: [PATCH 08/36] del printf --- paddle/operators/math/unpooling.cu | 1 - python/paddle/v2/fluid/tests/test_unpool_op.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index d26ceed6ad..bb8489fb95 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -42,7 +42,6 @@ __global__ void KernelUnpool2dMax(const int nthreads, int out_offset = bidx * out_bsize + cidx * out_csize; int out_index = indices_data[i]; PADDLE_ASSERT(out_index < (output_height * output_width)); - printf("-------%d------[%f]\n", out_offset + out_index, input_data[i]); output_data[out_offset + out_index] = input_data[i]; } } diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 7984743e6f..b1ddf95acc 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -54,9 +54,6 @@ class TestUnpoolOp(OpTest): self.outputs = {'Out': output.astype('float32')} def test_check_output(self): - print self.inputs['X'] - print self.inputs['Y'] - print self.outputs['Out'] self.check_output() def test_check_grad(self): From 0112c5d640d7e311f99fab553d7da9ee6653865c Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 22 Nov 2017 15:51:52 +0800 Subject: [PATCH 09/36] format code --- paddle/operators/math/unpooling.cc | 1 - paddle/operators/math/unpooling.cu | 30 +++++++++++++++--------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 1622dcca87..a3a24a6892 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -69,7 +69,6 @@ public: const int output_channels = output.dims()[1]; const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; - int input_feasize = input_height * input_width; int output_feasize = output_height * 
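// *_feasize is the per-channel plane size (height * width): each stored
// index addresses a position inside one channel's plane, while the outer
// batch and channel loops select which plane is read or written.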
output_width; const T* indices_data = indices.data(); diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index bb8489fb95..358847b315 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -29,21 +29,21 @@ __global__ void KernelUnpool2dMax(const int nthreads, T* output_data, const int output_height, const int output_width) { - int bsize = input_height * input_width * channels; - int csize = input_height * input_width; - int out_bsize = output_height * output_width * channels; - int out_csize = output_height * output_width; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int bidx = i / bsize; - int boffset = i % bsize; - int cidx = boffset / csize; - int out_offset = bidx * out_bsize + cidx * out_csize; - int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < (output_height * output_width)); - output_data[out_offset + out_index] = input_data[i]; - } + int bsize = input_height * input_width * channels; + int csize = input_height * input_width; + int out_bsize = output_height * output_width * channels; + int out_csize = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / bsize; + int boffset = i % bsize; + int cidx = boffset / csize; + int out_offset = bidx * out_bsize + cidx * out_csize; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < (output_height * output_width)); + output_data[out_offset + out_index] = input_data[i]; + } } template __global__ void KernelUnpool2dMaxGrad(const int nthreads, From e553d5728d52f4dd2ebc11228053ed31da05a62c Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 22 Nov 2017 15:59:02 +0800 Subject: [PATCH 10/36] format test code --- .../paddle/v2/fluid/tests/test_unpool_op.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index b1ddf95acc..106af9f5d9 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -15,7 +15,8 @@ def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings): index = indices[nidx, cidx, h, w] hidx = (index - index % out_W) / out_W widx = index % out_W - out[nidx, cidx, int(hidx), int(widx)] = input[nidx, cidx, h, w] + out[nidx, cidx, int(hidx), int(widx)] = \ + input[nidx, cidx, h, w] return out @@ -26,23 +27,31 @@ class TestUnpoolOp(OpTest): self.init_test_case() pre_input = np.random.random(self.shape).astype("float32") N, C, H, W = pre_input.shape - H_out = (H - self.ksize[0] + 2 * self.paddings[0]) / self.strides[0] + 1 - W_out = (W - self.ksize[1] + 2 * self.paddings[1]) / self.strides[1] + 1 + H_out = (H - self.ksize[0] + 2 * self.paddings[0]) / \ + self.strides[0] + 1 + W_out = (W - self.ksize[1] + 2 * self.paddings[1]) / \ + self.strides[1] + 1 input = np.zeros((N, C, H_out, W_out)) indices = np.zeros((N, C, H_out, W_out)) for i in xrange(H_out): for j in xrange(W_out): r_start = np.max((i * self.strides[0] - self.paddings[0], 0)) - r_end = np.min((i * self.strides[0] + self.ksize[0] - self.paddings[0], H)) + r_end = np.min((i * self.strides[0] + self.ksize[0] - \ + self.paddings[0], H)) c_start = np.max((j * self.strides[1] - self.paddings[1], 0)) - c_end = np.min((j * self.strides[1] + self.ksize[1] - 
self.paddings[1], W)) + c_end = np.min((j * self.strides[1] + self.ksize[1] - \ + self.paddings[1], W)) for nidx in xrange(N): for cidx in xrange(C): - x_masked = pre_input[nidx, cidx, r_start:r_end, c_start:c_end] + x_masked = pre_input[nidx, cidx, r_start:r_end, \ + c_start:c_end] input[nidx, cidx, i, j] = x_masked.max() arg = x_masked.argmax() - indices[nidx, cidx, i, j] = (r_start + arg / self.ksize[1]) * W + c_start + arg % self.ksize[1] - output = self.Unpool2d_forward_naive(input, indices, self.ksize, self.strides, self.paddings).astype("float32") + indices[nidx, cidx, i, j] = \ + (r_start + arg / self.ksize[1]) * W + \ + c_start + arg % self.ksize[1] + output = self.Unpool2d_forward_naive(input, indices, self.ksize, \ + self.strides, self.paddings).astype("float32") self.inputs = {'X': input.astype('float32'), 'Y': indices.astype('int16')} self.attrs = { @@ -57,7 +66,7 @@ class TestUnpoolOp(OpTest): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.5) + self.check_grad(['X'], 'Out') def init_test_case(self): self.Unpool2d_forward_naive = unpool2dmax_forward_naive From 66b84366f1e09366b28e41dbd0d3521152554115 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Thu, 23 Nov 2017 11:53:30 +0800 Subject: [PATCH 11/36] modify for code review by wangyi --- paddle/operators/unpool_op.cc | 26 +++++++++---------- paddle/operators/unpool_op.h | 47 ++++++++++++----------------------- 2 files changed, 28 insertions(+), 45 deletions(-) diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index add8f15736..b5f3d56e96 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -16,11 +16,9 @@ namespace paddle { namespace operators { -using framework::Tensor; - class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { public: - Unpool2dOpMaker(framework::OpProto* proto, \ + Unpool2dOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", @@ -38,26 +36,26 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "the number of channels, H and W is the height and " "width of feature."); AddAttr>("ksize", - "(vector ), the unpooling window size(height, width) " + "(vector), the unpooling window size(height, width) " "of unpooling operator."); AddAttr>("strides", "(vector, default:{1, 1}), " - "strides(height, width) of unpooling operator.") + "strides (height, width) of unpooling operator.") .SetDefault({1, 1}); AddAttr>("paddings", "(vector defalut:{0,0}), " - "paddings(height, width) of unpooling operator.") + "paddings (height, width) of unpooling operator.") .SetDefault({0, 0}); AddAttr("unpoolingtype", "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( - "input: the input Tensor to invert" - "indices: the indices given out by MaxPool2d" - "ksize – Size of the max pooling window." - "stride – Stride of the max pooling window." - "It is set to kernel_size by default." - "padding – Padding that was added to the input" + "input: the input Tensor to invert + indices: the indices given out by MaxPool2d + ksize – Size of the max pooling window. + stride – Stride of the max pooling window. + "It is set to kernel_size by default. 
+ padding – Padding that was added to the input" )DOC"); } }; @@ -80,14 +78,14 @@ class UnpoolOp : public framework::OperatorWithKernel { auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Y"); - std::string unpoolingtype = \ + std::string unpoolingtype = ctx->Attrs().Get("unpoolingtype"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); PADDLE_ENFORCE(in_x_dims.size() == 4, - "Unpooling intput should be 4-D."); + "Unpooling intput must be of 4-dimensional."); for (int i = 0; i < 4; ++i) { PADDLE_ENFORCE(in_x_dims[i] == in_y_dims[i], "X size must be eq Y size!"); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index e3a45ff9a7..e22171649e 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -21,15 +21,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; - template class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* in_y = context.Input("Y"); - auto * out = context.Output("Out"); + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Y"); + auto * out = context.Output("Out"); std::string unpoolingtype = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); @@ -39,15 +37,8 @@ class UnpoolKernel : public framework::OpKernel { math::SetConstant set_zero; set_zero(context.device_context(), out, static_cast(0)); } - switch (ksize.size()) { - case 2: { - if (unpoolingtype == "max") { - math::Unpool2dMaxFunctor unpool2d_max_forward; - unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); - } - } break; - default: { PADDLE_THROW("Pool op only supports 2D input."); } - } + math::Unpool2dMaxFunctor unpool2d_max_forward; + unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } }; @@ -55,12 +46,13 @@ template class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* in_y = context.Input("Y"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Y"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); std::string unpoolingtype = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); @@ -70,18 +62,11 @@ class UnpoolGradKernel : public framework::OpKernel { math::SetConstant zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0.0)); - } - switch (ksize.size()) { - case 2: { - if (unpoolingtype == "max") { - math::Unpool2dMaxGradFunctor unpool2d_max_backward; - unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, - *out, *out_grad); - } - } break; - default: { PADDLE_THROW("Unpool op only supports 2D input."); } + 
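// Allocate the gradient buffer only when X@GRAD is actually requested,
// and zero it before the functor overwrites each element with its
// matching output gradient.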
zero(device_ctx, in_x_grad, static_cast(0)); } + math::Unpool2dMaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, + *out, *out_grad); } }; From c218961a6b9d8603e881fde0bc87e2cf058d5c7a Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Sun, 26 Nov 2017 21:22:33 +0800 Subject: [PATCH 12/36] modify for code review by qingqing --- paddle/operators/math/unpooling.cc | 4 ++-- paddle/operators/math/unpooling.cu | 4 ++-- paddle/operators/math/unpooling.h | 6 ++---- paddle/operators/unpool_op.cc | 2 +- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index a3a24a6892..d8647c6b23 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -60,9 +60,9 @@ public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, - framework::Tensor * input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad) { + const framework::Tensor& output_grad, + framework::Tensor * input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 358847b315..d3eaa48547 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -114,9 +114,9 @@ class Unpool2dMaxGradFunctor { void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, - framework::Tensor * input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad) { + const framework::Tensor& output_grad, + framework::Tensor * input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 88e88ba117..bf79354ed9 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -14,8 +14,6 @@ limitations under the License. 
*/ #pragma once #include "paddle/framework/tensor.h" -#include "paddle/platform/device_context.h" -#include "paddle/platform/hostdevice.h" namespace paddle { namespace operators { @@ -37,9 +35,9 @@ class Unpool2dMaxGradFunctor { void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, - framework::Tensor * input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad); + const framework::Tensor& output_grad, + framework::Tensor * input_grad); }; } // namespace math } // namespace operators diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index b5f3d56e96..ada9ce8ce5 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -78,7 +78,7 @@ class UnpoolOp : public framework::OperatorWithKernel { auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Y"); - std::string unpoolingtype = + std::string unpooling_type = ctx->Attrs().Get("unpoolingtype"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); From cfd7721b51c2009bfbc9049d25da5eab6aa29745 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Mon, 27 Nov 2017 11:13:07 +0800 Subject: [PATCH 13/36] add unpool_op.h modify --- paddle/operators/unpool_op.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index e22171649e..ae11a9f4f8 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -28,7 +28,7 @@ class UnpoolKernel : public framework::OpKernel { const framework::Tensor* in_x = context.Input("X"); const framework::Tensor* in_y = context.Input("Y"); auto * out = context.Output("Out"); - std::string unpoolingtype = context.Attr("unpoolingtype"); + std::string unpooling_type = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); @@ -53,7 +53,7 @@ class UnpoolGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); framework::Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - std::string unpoolingtype = context.Attr("unpoolingtype"); + std::string unpooling_type = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); @@ -65,8 +65,8 @@ class UnpoolGradKernel : public framework::OpKernel { zero(device_ctx, in_x_grad, static_cast(0)); } math::Unpool2dMaxGradFunctor unpool2d_max_backward; - unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, - *out, *out_grad); + unpool2d_max_backward(context.device_context(), *in_x, *in_y, + *out, *out_grad, in_x_grad); } }; From 20654cf78a051a5079c68de7f7ff69239b063ba8 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Mon, 27 Nov 2017 14:54:39 +0800 Subject: [PATCH 14/36] modify for type check rewrite --- paddle/operators/math/unpooling.cc | 20 ++++++------ paddle/operators/math/unpooling.cu | 32 +++++++++---------- paddle/operators/math/unpooling.h | 4 +-- paddle/operators/unpool_op.cc | 26 ++++++++++++--- paddle/operators/unpool_op.cu.cc | 8 ++--- paddle/operators/unpool_op.h | 8 ++--- .../paddle/v2/fluid/tests/test_unpool_op.py | 2 +- 7 files changed, 58 insertions(+), 42 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index d8647c6b23..ab6212f387 
100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -19,8 +19,8 @@ namespace operators { namespace math { // All tensors are in NCHW format -template -class Unpool2dMaxFunctor { +template +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -35,7 +35,7 @@ class Unpool2dMaxFunctor { int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; const T* input_data = input.data(); - const T * indices_data = indices.data(); + const T2 * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { @@ -54,8 +54,8 @@ class Unpool2dMaxFunctor { -template -class Unpool2dMaxGradFunctor { +template +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -71,7 +71,7 @@ public: const int output_width = output.dims()[3]; int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; - const T* indices_data = indices.data(); + const T2 * indices_data = indices.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); @@ -90,10 +90,10 @@ public: } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index d3eaa48547..c8fd58eca5 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -19,10 +19,10 @@ namespace paddle { namespace operators { namespace math { -template +template __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const T* indices_data, + const T2 * indices_data, const int input_height, const int input_width, const int channels, @@ -45,10 +45,10 @@ __global__ void KernelUnpool2dMax(const int nthreads, output_data[out_offset + out_index] = input_data[i]; } } -template +template __global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, - const T* indices_data, + const T2* indices_data, const int input_height, const int input_width, const int channels, @@ -76,8 +76,8 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, /* * All tensors are in NCHW format. 
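* A worked example of the index decomposition these kernels use, assuming
* channels = 2, a 2 x 2 input plane and a 4 x 4 output plane: the
* per-batch input stride is 2 * 2 * 2 = 8 and the per-channel stride is 4,
* so flat element i = 9 belongs to batch 1 (9 / 8) and channel 0
* ((9 % 8) / 4), and is written at offset 1 * (4 * 4 * 2) + 0 * (4 * 4) = 32
* plus the stored within-plane index.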
*/ -template -class Unpool2dMaxFunctor { +template +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -90,7 +90,7 @@ class Unpool2dMaxFunctor { const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; const T* input_data = input.data(); - const T* indices_data = indices.data(); + const T2 * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int nthreads = batch_size * output_channels * input_height * input_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -98,7 +98,7 @@ class Unpool2dMaxFunctor { dim3 grid(blocks, 1); KernelUnpool2dMax< - T><<<<(context) .stream()>>>(nthreads, input_data, indices_data, input_height, input_width, output_channels, @@ -108,8 +108,8 @@ class Unpool2dMaxFunctor { /* * All tensors are in NCHW format. */ -template -class Unpool2dMaxGradFunctor { +template +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -124,7 +124,7 @@ class Unpool2dMaxGradFunctor { const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; const T* input_data = input.data(); - const T* indices_data = indices.data(); + const T2 * indices_data = indices.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); @@ -134,7 +134,7 @@ class Unpool2dMaxGradFunctor { dim3 grid(blocks, 1); KernelUnpool2dMaxGrad< - T><<<<(context) .stream()>>>( nthreads, input_data, indices_data, @@ -145,11 +145,11 @@ class Unpool2dMaxGradFunctor { } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index bf79354ed9..e086b891a1 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { namespace math { -template +template class Unpool2dMaxFunctor { public: @@ -29,7 +29,7 @@ class Unpool2dMaxFunctor { framework::Tensor * output); }; -template +template class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index ada9ce8ce5..f00459cd85 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -66,7 +66,15 @@ int OutputSize(int input_size, int ksize, int padding, int stride) { } class UnpoolOp : public framework::OperatorWithKernel { - public: +protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + +public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" @@ -102,6 +110,14 @@ class UnpoolOp : public framework::OperatorWithKernel { }; class UnpoolOpGrad : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetKernelType( + const 
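// As in the forward op, kernel dispatch is keyed on the element type of
// input X (ToDataType of X), so the int32 indices tensor never selects
// the kernel; this is why the registrations pair a float or double data
// type with an int index type.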
framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { @@ -118,9 +134,9 @@ namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); REGISTER_OP_CPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL(unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 4949fc467e..0a1d8b5996 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -16,10 +16,10 @@ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_GPU_KERNEL(unpool_grad, ops::UnpoolGradKernel, + float, int>, ops::UnpoolGradKernel); + double, int>); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index ae11a9f4f8..c294221181 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -37,12 +37,12 @@ class UnpoolKernel : public framework::OpKernel { math::SetConstant set_zero; set_zero(context.device_context(), out, static_cast(0)); } - math::Unpool2dMaxFunctor unpool2d_max_forward; + math::Unpool2dMaxFunctor unpool2d_max_forward; unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } }; -template +template class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -64,7 +64,7 @@ class UnpoolGradKernel : public framework::OpKernel { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); } - math::Unpool2dMaxGradFunctor unpool2d_max_backward; + math::Unpool2dMaxGradFunctor unpool2d_max_backward; unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out, *out_grad, in_x_grad); } diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 106af9f5d9..3fdee9091f 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -53,7 +53,7 @@ class TestUnpoolOp(OpTest): output = self.Unpool2d_forward_naive(input, indices, self.ksize, \ self.strides, self.paddings).astype("float32") self.inputs = {'X': input.astype('float32'), - 'Y': indices.astype('int16')} + 'Y': indices.astype('int32')} self.attrs = { 'strides': self.strides, 'paddings': self.paddings, From f9c2a5c38e3800387aaedcc05bf0e49d0f568a65 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Mon, 27 Nov 2017 15:56:45 +0800 Subject: [PATCH 15/36] modify for code review zcd --- paddle/operators/unpool_op.cc | 4 ++-- paddle/operators/unpool_op.h | 4 ++-- python/paddle/v2/fluid/tests/test_unpool_op.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index f00459cd85..addceca159 100644 --- a/paddle/operators/unpool_op.cc +++ 
b/paddle/operators/unpool_op.cc @@ -46,7 +46,7 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(vector defalut:{0,0}), " "paddings (height, width) of unpooling operator.") .SetDefault({0, 0}); - AddAttr("unpoolingtype", + AddAttr("unpooling_type", "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( @@ -87,7 +87,7 @@ public: auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Y"); std::string unpooling_type = - ctx->Attrs().Get("unpoolingtype"); + ctx->Attrs().Get("unpooling_type"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index c294221181..f05d22b49f 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -28,7 +28,7 @@ class UnpoolKernel : public framework::OpKernel { const framework::Tensor* in_x = context.Input("X"); const framework::Tensor* in_y = context.Input("Y"); auto * out = context.Output("Out"); - std::string unpooling_type = context.Attr("unpoolingtype"); + std::string unpooling_type = context.Attr("unpooling_type"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); @@ -53,7 +53,7 @@ class UnpoolGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); framework::Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - std::string unpooling_type = context.Attr("unpoolingtype"); + std::string unpooling_type = context.Attr("unpooling_type"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 3fdee9091f..22826dc1b3 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -58,7 +58,7 @@ class TestUnpoolOp(OpTest): 'strides': self.strides, 'paddings': self.paddings, 'ksize': self.ksize, - 'unpoolingtype': self.unpoolingtype, + 'unpooling_type': self.unpooling_type, } self.outputs = {'Out': output.astype('float32')} @@ -70,7 +70,7 @@ class TestUnpoolOp(OpTest): def init_test_case(self): self.Unpool2d_forward_naive = unpool2dmax_forward_naive - self.unpoolingtype = "max" + self.unpooling_type = "max" self.shape = [6, 4, 5, 5] self.ksize = [3, 3] self.strides = [2, 2] From 57e68e574026d2853b6fcec069647322959493b7 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 28 Nov 2017 08:15:33 +0800 Subject: [PATCH 16/36] modify for code review by qingqing 2nd --- paddle/operators/math/unpooling.cu | 48 +++++++++--------- paddle/operators/unpool_op.cc | 49 ++++++++----------- paddle/operators/unpool_op.cu.cc | 18 +++---- paddle/operators/unpool_op.h | 6 +-- .../paddle/v2/fluid/tests/test_unpool_op.py | 36 +++++++------- 5 files changed, 72 insertions(+), 85 deletions(-) diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index c8fd58eca5..99e6fd052a 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -29,19 +29,19 @@ __global__ void KernelUnpool2dMax(const int nthreads, T* output_data, const int output_height, const int output_width) { - int bsize = input_height * input_width * channels; - int csize = input_height * input_width; - 
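// The renames below (bsize/csize to in_n_stride/in_c_stride, plus the
// out_* counterparts) keep the arithmetic identical but make each
// variable's NCHW stride role explicit; the PADDLE_ASSERT bound is the
// same quantity, restated as out_c_stride.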
int out_bsize = output_height * output_width * channels; - int out_csize = output_height * output_width; + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; for (int i = index; i < nthreads; i += offset) { - int bidx = i / bsize; - int boffset = i % bsize; - int cidx = boffset / csize; - int out_offset = bidx * out_bsize + cidx * out_csize; + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < (output_height * output_width)); + PADDLE_ASSERT(out_index < out_c_stride); output_data[out_offset + out_index] = input_data[i]; } } @@ -57,19 +57,19 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, const int output_height, const int output_width, T* input_grad) { - int bsize = input_height * input_width * channels; - int csize = input_height * input_width; - int out_bsize = output_height * output_width * channels; - int out_csize = output_height * output_width; + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; for (int i = index; i < nthreads; i += offset) { - int bidx = i / bsize; - int boffset = i % bsize; - int cidx = boffset / csize; - int out_offset = bidx * out_bsize + cidx * out_csize; + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < (output_height * output_width)); + PADDLE_ASSERT(out_index < out_c_stride); input_grad[i] = output_grad[out_offset + out_index]; } } @@ -93,10 +93,8 @@ class Unpool2dMaxFunctor { const T2 * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int nthreads = batch_size * output_channels * input_height * input_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax< T, T2><<(context) @@ -129,10 +127,8 @@ class Unpool2dMaxGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int nthreads = batch_size * output_channels * input_height * input_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad< T, T2><<(context) diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index addceca159..49a5129188 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/unpool_op.h" namespace paddle { @@ -25,7 +25,7 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) The input tensor of unpool operator. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); - AddInput("Y", + AddInput("Indices", "(Tensor) The input tensor of the indices given out by MaxPool2d. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); @@ -50,12 +50,10 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( - "input: the input Tensor to invert - indices: the indices given out by MaxPool2d - ksize – Size of the max pooling window. - stride – Stride of the max pooling window. - "It is set to kernel_size by default. 
- padding – Padding that was added to the input" + "Paper: http://www.matthewzeiler.com/wp-content/uploads/2017 + /07/iccv2011.pdf + PyTorch: http://pytorch.org/docs/master/nn.html?highlight=unpool# + torch.nn.MaxUnpool2d" )DOC"); } }; @@ -79,27 +77,20 @@ public: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of UnpoolOp" + PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" "should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UnpoolOp should not be null."); - auto in_x_dims = ctx->GetInputDim("X"); - auto in_y_dims = ctx->GetInputDim("Y"); + auto in_y_dims = ctx->GetInputDim("Indices"); std::string unpooling_type = ctx->Attrs().Get("unpooling_type"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE(in_x_dims.size() == 4, "Unpooling intput must be of 4-dimensional."); - for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE(in_x_dims[i] == in_y_dims[i], - "X size must be eq Y size!"); - } - - + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); for (size_t i = 0; i < ksize.size(); ++i) { output_shape.push_back( diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 0a1d8b5996..9b5ac667d3 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/unpool_op.h" diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index f05d22b49f..dfd4ef12b5 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -2,7 +2,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at +Indicesou may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 @@ -26,7 +26,7 @@ class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* in_y = context.Input("Y"); + const framework::Tensor* in_y = context.Input("Indices"); auto * out = context.Output("Out"); std::string unpooling_type = context.Attr("unpooling_type"); std::vector ksize = context.Attr>("ksize"); @@ -47,7 +47,7 @@ class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* in_y = context.Input("Y"); + const framework::Tensor* in_y = context.Input("Indices"); const framework::Tensor* out = context.Input("Out"); const framework::Tensor* out_grad = context.Input(framework::GradVarName("Out")); diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 22826dc1b3..b3c6c85025 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -5,16 +5,16 @@ from op_test import OpTest def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings): s0, s1, s2, s3 = input.shape - out_H=(s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0] - out_W=(s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1] - out = np.zeros((s0, s1, out_H, out_W)) + out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0] + out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1] + out = np.zeros((s0, s1, out_hsize, out_wsize)) for nidx in xrange(s0): for cidx in xrange(s1): for h in xrange(s2): for w in xrange(s3): index = indices[nidx, cidx, h, w] - hidx = (index - index % out_W) / out_W - widx = index % out_W + hidx = (index - index % out_wsize) / out_wsize + widx = index % out_wsize out[nidx, cidx, int(hidx), int(widx)] = \ input[nidx, cidx, h, w] @@ -26,34 +26,34 @@ class TestUnpoolOp(OpTest): self.op_type = "unpool" self.init_test_case() pre_input = np.random.random(self.shape).astype("float32") - N, C, H, W = pre_input.shape - H_out = (H - self.ksize[0] + 2 * self.paddings[0]) / \ + nsize, csize, hsize, wsize = pre_input.shape + hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \ self.strides[0] + 1 - W_out = (W - self.ksize[1] + 2 * self.paddings[1]) / \ + wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \ self.strides[1] + 1 - input = np.zeros((N, C, H_out, W_out)) - indices = np.zeros((N, C, H_out, W_out)) - for i in xrange(H_out): - for j in xrange(W_out): + input = np.zeros((nsize, csize, hsize_out, wsize_out)) + indices = np.zeros((nsize, csize, hsize_out, wsize_out)) + for i in xrange(hsize_out): + for j in xrange(wsize_out): r_start = np.max((i * self.strides[0] - self.paddings[0], 0)) r_end = np.min((i * self.strides[0] + self.ksize[0] - \ - self.paddings[0], H)) + self.paddings[0], hsize)) c_start = np.max((j * self.strides[1] - self.paddings[1], 0)) c_end = np.min((j * self.strides[1] + self.ksize[1] - \ - self.paddings[1], W)) - for nidx in xrange(N): - for cidx in xrange(C): + self.paddings[1], wsize)) + for nidx in xrange(nsize): + for cidx in xrange(csize): x_masked = pre_input[nidx, cidx, r_start:r_end, \ c_start:c_end] input[nidx, cidx, i, j] = x_masked.max() arg = x_masked.argmax() indices[nidx, cidx, i, j] = \ - (r_start 
+ arg / self.ksize[1]) * W + \ + (r_start + arg / self.ksize[1]) * wsize + \ c_start + arg % self.ksize[1] output = self.Unpool2d_forward_naive(input, indices, self.ksize, \ self.strides, self.paddings).astype("float32") self.inputs = {'X': input.astype('float32'), - 'Y': indices.astype('int32')} + 'Indices': indices.astype('int32')} self.attrs = { 'strides': self.strides, 'paddings': self.paddings, From 6fc9a9fd690e2d5fe48f2b39ed2575a04ef32103 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 28 Nov 2017 23:15:09 +0800 Subject: [PATCH 17/36] modify for del T2 and doc update --- paddle/operators/math/unpooling.cc | 20 +++++----- paddle/operators/math/unpooling.cu | 39 +++++++++---------- paddle/operators/math/unpooling.h | 4 +- paddle/operators/unpool_op.cc | 19 +++++---- paddle/operators/unpool_op.cu.cc | 8 ++-- paddle/operators/unpool_op.h | 8 ++-- .../paddle/v2/fluid/tests/test_unpool_op.py | 4 +- 7 files changed, 52 insertions(+), 50 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index ab6212f387..dbc3936971 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -19,8 +19,8 @@ namespace operators { namespace math { // All tensors are in NCHW format -template -class Unpool2dMaxFunctor { +template +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -35,7 +35,7 @@ class Unpool2dMaxFunctor { int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; const T* input_data = input.data(); - const T2 * indices_data = indices.data(); + const int * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { @@ -54,8 +54,8 @@ class Unpool2dMaxFunctor { -template -class Unpool2dMaxGradFunctor { +template +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -71,7 +71,7 @@ public: const int output_width = output.dims()[3]; int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; - const T2 * indices_data = indices.data(); + const int * indices_data = indices.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); @@ -90,10 +90,10 @@ public: } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 99e6fd052a..9cdd61f6d5 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -19,10 +19,10 @@ namespace paddle { namespace operators { namespace math { -template +template __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const T2 * indices_data, + const int * indices_data, const int input_height, const int input_width, const int channels, @@ -45,10 +45,10 @@ __global__ void KernelUnpool2dMax(const int nthreads, output_data[out_offset + out_index] = input_data[i]; } } -template +template __global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, 
- const T2* indices_data, + const int* indices_data, const int input_height, const int input_width, const int channels, @@ -76,8 +76,8 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, /* * All tensors are in NCHW format. */ -template -class Unpool2dMaxFunctor { +template +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -90,15 +90,14 @@ class Unpool2dMaxFunctor { const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; const T* input_data = input.data(); - const T2 * indices_data = indices.data(); + const int * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = batch_size * output_channels * input_height * input_width; int threads = 1024; int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax< - T, T2><<<<(context) - .stream()>>>(nthreads, input_data, indices_data, + .stream()>>>(input.numel(), input_data, indices_data, input_height, input_width, output_channels, output_data, output_height, output_width); } @@ -106,8 +105,8 @@ class Unpool2dMaxFunctor { /* * All tensors are in NCHW format. */ -template -class Unpool2dMaxGradFunctor { +template +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -122,18 +121,16 @@ class Unpool2dMaxGradFunctor { const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; const T* input_data = input.data(); - const T2 * indices_data = indices.data(); + const int * indices_data = indices.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = batch_size * output_channels * input_height * input_width; int threads = 1024; int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad< - T, T2><<<<(context) - .stream()>>>( - nthreads, input_data, indices_data, + .stream()>>>(input.numel(), input_data, indices_data, input_height, input_width, output_channels, output_data, output_grad_data, output_height, output_width, @@ -141,11 +138,11 @@ class Unpool2dMaxGradFunctor { } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index e086b891a1..bf79354ed9 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { namespace math { -template +template class Unpool2dMaxFunctor { public: @@ -29,7 +29,7 @@ class Unpool2dMaxFunctor { framework::Tensor * output); }; -template +template class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 49a5129188..2505148764 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -50,10 +50,15 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( - "Paper: 
http://www.matthewzeiler.com/wp-content/uploads/2017 + "Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + $$ + H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ + W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] + $$ + Paper: http://www.matthewzeiler.com/wp-content/uploads/2017 /07/iccv2011.pdf - PyTorch: http://pytorch.org/docs/master/nn.html?highlight=unpool# - torch.nn.MaxUnpool2d" )DOC"); } }; @@ -125,9 +130,9 @@ namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); REGISTER_OP_CPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL(unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 9b5ac667d3..d8214fc687 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -16,10 +16,10 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_GPU_KERNEL(unpool_grad, ops::UnpoolGradKernel, + float>, ops::UnpoolGradKernel); + double>); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index dfd4ef12b5..f618a7c0ba 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -37,12 +37,12 @@ class UnpoolKernel : public framework::OpKernel { math::SetConstant set_zero; set_zero(context.device_context(), out, static_cast(0)); } - math::Unpool2dMaxFunctor unpool2d_max_forward; + math::Unpool2dMaxFunctor unpool2d_max_forward; unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } }; -template +template class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -64,7 +64,7 @@ class UnpoolGradKernel : public framework::OpKernel { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); } - math::Unpool2dMaxGradFunctor unpool2d_max_backward; + math::Unpool2dMaxGradFunctor unpool2d_max_backward; unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out, *out_grad, in_x_grad); } diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index b3c6c85025..292b9bc14a 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -50,7 +50,7 @@ class TestUnpoolOp(OpTest): indices[nidx, cidx, i, j] = \ (r_start + arg / self.ksize[1]) * wsize + \ c_start + arg % self.ksize[1] - output = self.Unpool2d_forward_naive(input, indices, self.ksize, \ + output = self.unpool2d_forward_naive(input, indices, self.ksize, \ self.strides, self.paddings).astype("float32") self.inputs = {'X': input.astype('float32'), 'Indices': indices.astype('int32')} @@ -69,7 +69,7 @@ class TestUnpoolOp(OpTest): self.check_grad(['X'], 'Out') def init_test_case(self): - self.Unpool2d_forward_naive = unpool2dmax_forward_naive + self.unpool2d_forward_naive = unpool2dmax_forward_naive 
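        # A worked check of the shape relation this test exercises (assuming
        # the setUp above max-pools with the same ksize/strides): a 5x5
        # feature map pooled with a 3x3 window at stride 2 gives 2x2, and
        # unpooling restores
        #   H_out = (H_in - 1) * stride - 2 * padding + ksize
        #         = (2 - 1) * 2 - 2 * 0 + 3 = 5,
        # matching the height in self.shape below.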
self.unpooling_type = "max" self.shape = [6, 4, 5, 5] self.ksize = [3, 3] From d9673cad153be572dbf356733f4e9ae6df1d56d2 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 29 Nov 2017 07:44:52 +0800 Subject: [PATCH 18/36] format code --- paddle/operators/math/unpooling.cc | 4 ++-- paddle/operators/math/unpooling.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index dbc3936971..35091e849b 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -35,7 +35,7 @@ class Unpool2dMaxFunctor { int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; const T* input_data = input.data(); - const int * indices_data = indices.data(); + const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { @@ -71,7 +71,7 @@ public: const int output_width = output.dims()[3]; int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; - const int * indices_data = indices.data(); + const int* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 9cdd61f6d5..9f27e35d00 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -90,7 +90,7 @@ class Unpool2dMaxFunctor { const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; const T* input_data = input.data(); - const int * indices_data = indices.data(); + const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; @@ -121,7 +121,7 @@ class Unpool2dMaxGradFunctor { const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; const T* input_data = input.data(); - const int * indices_data = indices.data(); + const int* indices_data = indices.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); From bd561384bca825088417fbd69dc2282b7581bf3c Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 29 Nov 2017 08:23:50 +0800 Subject: [PATCH 19/36] format code --- paddle/operators/math/unpooling.cc | 17 +-- paddle/operators/math/unpooling.cu | 87 ++++++------ paddle/operators/math/unpooling.h | 9 +- paddle/operators/unpool_op.cc | 134 ++++++++++-------- paddle/operators/unpool_op.h | 8 +- .../paddle/v2/fluid/tests/test_unpool_op.py | 18 +-- 6 files changed, 133 insertions(+), 140 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 35091e849b..b13d0104de 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -17,15 +17,13 @@ limitations under the License. 
*/ namespace paddle { namespace operators { namespace math { - // All tensors are in NCHW format template class Unpool2dMaxFunctor { - public: +public: void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor * output) { + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -40,7 +38,7 @@ class Unpool2dMaxFunctor { for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { - int index = indices_data[i]; + int index = indices_data[i]; PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); output_data[index] = input_data[i]; } @@ -51,9 +49,6 @@ class Unpool2dMaxFunctor { } } }; - - - template class Unpool2dMaxGradFunctor { public: @@ -62,7 +57,7 @@ public: const framework::Tensor& indices, const framework::Tensor& output, const framework::Tensor& output_grad, - framework::Tensor * input_grad) { + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -89,12 +84,10 @@ public: } } }; - template class Unpool2dMaxGradFunctor; template class Unpool2dMaxGradFunctor; template class Unpool2dMaxFunctor; template class Unpool2dMaxFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 9f27e35d00..6017920873 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -18,36 +18,33 @@ limitations under the License. 
*/ namespace paddle { namespace operators { namespace math { - template -__global__ void KernelUnpool2dMax(const int nthreads, - const T* input_data, - const int * indices_data, +__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, + const int* indices_data, const int input_height, const int input_width, const int channels, T* output_data, const int output_height, const int output_width) { - int in_n_stride = input_height * input_width * channels; - int in_c_stride = input_height * input_width; - int out_n_stride = output_height * output_width * channels; - int out_c_stride = output_height * output_width; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int bidx = i / in_n_stride; - int boffset = i % in_n_stride; - int cidx = boffset / in_c_stride; - int out_offset = bidx * out_n_stride + cidx * out_c_stride; - int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < out_c_stride); - output_data[out_offset + out_index] = input_data[i]; - } + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + output_data[out_offset + out_index] = input_data[i]; + } } template -__global__ void KernelUnpool2dMaxGrad(const int nthreads, - const T* input_data, +__global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, const int* indices_data, const int input_height, const int input_width, @@ -57,32 +54,32 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, const int output_height, const int output_width, T* input_grad) { - int in_n_stride = input_height * input_width * channels; - int in_c_stride = input_height * input_width; - int out_n_stride = output_height * output_width * channels; - int out_c_stride = output_height * output_width; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int bidx = i / in_n_stride; - int boffset = i % in_n_stride; - int cidx = boffset / in_c_stride; - int out_offset = bidx * out_n_stride + cidx * out_c_stride; - int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < out_c_stride); - input_grad[i] = output_grad[out_offset + out_index]; - } + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + input_grad[i] = output_grad[out_offset + out_index]; + } } /* * All tensors are in NCHW format. 
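 *
 * A short sketch of the index arithmetic the two kernels above share
 * (names as defined there; nothing beyond the visible code is assumed):
 *
 *   int bidx = i / in_n_stride;                  // sample within the batch
 *   int cidx = (i % in_n_stride) / in_c_stride;  // channel within the sample
 *   int out_offset = bidx * out_n_stride + cidx * out_c_stride;
 *   // forward scatters through the saved argmax, backward gathers back:
 *   //   output_data[out_offset + indices_data[i]] = input_data[i];
 *   //   input_grad[i] = output_grad[out_offset + indices_data[i]];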
*/ template class Unpool2dMaxFunctor { - public: +public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, - framework::Tensor * output) { + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -93,7 +90,7 @@ class Unpool2dMaxFunctor { const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; - int grid = (input.numel() + threads - 1) / threads; + int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax< T><<(context) @@ -107,13 +104,13 @@ class Unpool2dMaxFunctor { */ template class Unpool2dMaxGradFunctor { - public: +public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, const framework::Tensor& output_grad, - framework::Tensor * input_grad) { + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -126,24 +123,20 @@ class Unpool2dMaxGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int threads = 1024; - int grid = (input.numel() + threads - 1) / threads; + int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad< T><<(context) .stream()>>>(input.numel(), input_data, indices_data, input_height, input_width, output_channels, output_data, output_grad_data, - output_height, output_width, - input_grad_data); + output_height, output_width, input_grad_data); } }; - template class Unpool2dMaxGradFunctor; template class Unpool2dMaxGradFunctor; - template class Unpool2dMaxFunctor; template class Unpool2dMaxFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index bf79354ed9..0b969d8d82 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -22,22 +22,21 @@ namespace math { template class Unpool2dMaxFunctor { - public: +public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor * output); + const framework::Tensor& indices, framework::Tensor* output); }; template class Unpool2dMaxGradFunctor { - public: +public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, const framework::Tensor& output_grad, - framework::Tensor * input_grad); + framework::Tensor* input_grad); }; } // namespace math } // namespace operators diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 2505148764..cabf17401b 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -21,107 +21,115 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { Unpool2dOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", + AddInput( + "X", "(Tensor) The input tensor of unpool operator. " "The format of input tensor is NCHW. 
Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); - AddInput("Indices", + AddInput( + "Indices", "(Tensor) The input tensor of the indices given out by MaxPool2d. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); - AddOutput("Out", + AddOutput( + "Out", "(Tensor) The output tensor of unpool operator." "The format of output tensor is also NCHW." "Where N is batch size, C is " "the number of channels, H and W is the height and " "width of feature."); - AddAttr>("ksize", + AddAttr>( + "ksize", "(vector), the unpooling window size(height, width) " "of unpooling operator."); - AddAttr>("strides", + AddAttr>( + "strides", "(vector, default:{1, 1}), " "strides (height, width) of unpooling operator.") .SetDefault({1, 1}); - AddAttr>("paddings", + AddAttr>( + "paddings", "(vector defalut:{0,0}), " "paddings (height, width) of unpooling operator.") .SetDefault({0, 0}); - AddAttr("unpooling_type", + AddAttr( + "unpooling_type", "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( - "Input shape: $(N, C_{in}, H_{in}, W_{in})$ - Output shape: $(N, C_{out}, H_{out}, W_{out})$ - Where + "Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where $$ H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] $$ - Paper: http://www.matthewzeiler.com/wp-content/uploads/2017 - /07/iccv2011.pdf + Paper: http://www.matthewzeiler.com/wp-content/uploads/2017 + /07/iccv2011.pdf )DOC"); } }; int OutputSize(int input_size, int ksize, int padding, int stride) { - int output_size = (input_size -1) * stride - 2 * padding + ksize; + int output_size = (input_size - 1) * stride - 2 * padding + ksize; return output_size; } class UnpoolOp : public framework::OperatorWithKernel { -protected: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); - } + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } -public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" - "should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UnpoolOp should not be null."); - auto in_x_dims = ctx->GetInputDim("X"); - auto in_y_dims = ctx->GetInputDim("Indices"); - std::string unpooling_type = - ctx->Attrs().Get("unpooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE(in_x_dims.size() == 4, - "Unpooling intput must be 
of 4-dimensional."); - PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back( - OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); - } - ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - } + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = + ctx->Attrs().Get("unpooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = + ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Unpooling intput must be of 4-dimensional."); + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } }; class UnpoolOpGrad : public framework::OperatorWithKernel { - protected: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); } - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Input(X@GRAD) should not be null."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } }; } // namespace operators } // namespace paddle @@ -129,10 +137,10 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_CPU_KERNEL(unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_CPU_KERNEL( + unpool,ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL( + unpool_grad, ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index f618a7c0ba..8fad768e49 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -27,7 +27,7 @@ class UnpoolKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { const framework::Tensor* in_x = context.Input("X"); const framework::Tensor* in_y = context.Input("Indices"); - auto * out = context.Output("Out"); + auto* out = context.Output("Out"); std::string unpooling_type = context.Attr("unpooling_type"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); @@ -52,7 +52,7 @@ 
class UnpoolGradKernel : public framework::OpKernel { const framework::Tensor* out_grad = context.Input(framework::GradVarName("Out")); framework::Tensor* in_x_grad = - context.Output(framework::GradVarName("X")); + context.Output(framework::GradVarName("X")); std::string unpooling_type = context.Attr("unpooling_type"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); @@ -65,8 +65,8 @@ class UnpoolGradKernel : public framework::OpKernel { zero(device_ctx, in_x_grad, static_cast(0)); } math::Unpool2dMaxGradFunctor unpool2d_max_backward; - unpool2d_max_backward(context.device_context(), *in_x, *in_y, - *out, *out_grad, in_x_grad); + unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out, + *out_grad, in_x_grad); } }; diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 292b9bc14a..321cd9fab8 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -52,14 +52,16 @@ class TestUnpoolOp(OpTest): c_start + arg % self.ksize[1] output = self.unpool2d_forward_naive(input, indices, self.ksize, \ self.strides, self.paddings).astype("float32") - self.inputs = {'X': input.astype('float32'), - 'Indices': indices.astype('int32')} + self.inputs = { + 'X': input.astype('float32'), + 'Indices': indices.astype('int32') + } self.attrs = { - 'strides': self.strides, - 'paddings': self.paddings, - 'ksize': self.ksize, - 'unpooling_type': self.unpooling_type, - } + 'strides': self.strides, + 'paddings': self.paddings, + 'ksize': self.ksize, + 'unpooling_type': self.unpooling_type, + } self.outputs = {'Out': output.astype('float32')} def test_check_output(self): @@ -76,7 +78,5 @@ class TestUnpoolOp(OpTest): self.strides = [2, 2] self.paddings = [0, 0] - - if __name__ == '__main__': unittest.main() From c52ed8de37b922b8cc5d9ab1a4ff34a426667ed6 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 29 Nov 2017 10:57:55 +0800 Subject: [PATCH 20/36] format code --- paddle/operators/math/unpooling.cc | 22 +++++------ paddle/operators/math/unpooling.cu | 59 ++++++++++++------------------ paddle/operators/math/unpooling.h | 23 +++++------- paddle/operators/unpool_op.cc | 15 ++++---- paddle/operators/unpool_op.cu.cc | 14 +++---- paddle/operators/unpool_op.h | 3 -- 6 files changed, 54 insertions(+), 82 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index b13d0104de..71928314ba 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -13,17 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/operators/math/unpooling.h" - namespace paddle { namespace operators { namespace math { -// All tensors are in NCHW format template class Unpool2dMaxFunctor { -public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output) { + public: + void operator()( + const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -51,13 +49,11 @@ public: }; template class Unpool2dMaxGradFunctor { -public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - const framework::Tensor& output, - const framework::Tensor& output_grad, - framework::Tensor* input_grad) { + public: + void operator()( + const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, const framework::Tensor& output, + const framework::Tensor& output_grad, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 6017920873..4c6cb7bbca 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -19,14 +19,10 @@ namespace paddle { namespace operators { namespace math { template -__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const int* indices_data, - const int input_height, - const int input_width, - const int channels, - T* output_data, - const int output_height, - const int output_width) { +__global__ void KernelUnpool2dMax( + const int nthreads, const T* input_data, const int* indices_data, + const int input_height, const int input_width, const int channels, + T* output_data, const int output_height, const int output_width) { int in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -44,16 +40,11 @@ __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, } } template -__global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, - const int* indices_data, - const int input_height, - const int input_width, - const int channels, - const T* output_data, - const T* output_grad, - const int output_height, - const int output_width, - T* input_grad) { +__global__ void KernelUnpool2dMaxGrad( + const int nthreads, const T* input_data, const int* indices_data, + const int input_height, const int input_width, const int channels, + const T* output_data, const T* output_grad, const int output_height, + const int output_width, T* input_grad) { int in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -75,11 +66,10 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, */ template class Unpool2dMaxFunctor { -public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor* output) { + public: + void operator()( + const platform::DeviceContext& context, const framework::Tensor& input, + const 
framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -91,12 +81,11 @@ public: T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMax< - T><<<<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_height, output_width); + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, + output_data, output_height, output_width); } }; /* @@ -104,7 +93,7 @@ public: */ template class Unpool2dMaxGradFunctor { -public: + public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, @@ -124,13 +113,11 @@ public: T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMaxGrad< - T><<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_grad_data, - output_height, output_width, input_grad_data); + KernelUnpool2dMaxGrad<<(context) + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, output_data, + output_grad_data, output_height, output_width, input_grad_data); } }; template class Unpool2dMaxGradFunctor; diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 0b969d8d82..43e32bf4fb 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -18,25 +18,20 @@ limitations under the License. */ namespace paddle { namespace operators { namespace math { - template - class Unpool2dMaxFunctor { -public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output); + public: + void operator()( + const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output); }; - template class Unpool2dMaxGradFunctor { -public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - const framework::Tensor& output, - const framework::Tensor& output_grad, - framework::Tensor* input_grad); + public: + void operator()( + const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, const framework::Tensor& output, + const framework::Tensor& output_grad, framework::Tensor* input_grad); }; } // namespace math } // namespace operators diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index cabf17401b..a51df3aa42 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -31,13 +31,12 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) The input tensor of the indices given out by MaxPool2d. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); - AddOutput( - "Out", - "(Tensor) The output tensor of unpool operator." - "The format of output tensor is also NCHW." 
- "Where N is batch size, C is " - "the number of channels, H and W is the height and " - "width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of unpool operator." + "The format of output tensor is also NCHW." + "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); AddAttr>( "ksize", "(vector), the unpooling window size(height, width) " @@ -138,7 +137,7 @@ namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); REGISTER_OP_CPU_KERNEL( - unpool,ops::UnpoolKernel, + unpool, ops::UnpoolKernel, ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL( unpool_grad, ops::UnpoolGradKernel, diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index d8214fc687..8ee9e2b373 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_GPU_KERNEL(unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_GPU_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_GPU_KERNEL( + unpool_grad, ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 8fad768e49..243eb7e532 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { - template class UnpoolKernel : public framework::OpKernel { public: @@ -41,7 +40,6 @@ class UnpoolKernel : public framework::OpKernel { unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } }; - template class UnpoolGradKernel : public framework::OpKernel { public: @@ -69,6 +67,5 @@ class UnpoolGradKernel : public framework::OpKernel { *out_grad, in_x_grad); } }; - } // namespace operators } // namespace paddle From d2ee3c98df1203ca68e711a1fb04ddbd6d048b33 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 29 Nov 2017 11:23:46 +0800 Subject: [PATCH 21/36] format code --- paddle/operators/math/unpooling.cc | 16 +-- paddle/operators/math/unpooling.cu | 50 ++++---- paddle/operators/math/unpooling.h | 17 +-- paddle/operators/unpool_op.cc | 114 +++++++++--------- paddle/operators/unpool_op.cu.cc | 8 +- .../paddle/v2/fluid/tests/test_unpool_op.py | 5 +- 6 files changed, 110 insertions(+), 100 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 71928314ba..9017ffaab1 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -19,9 +19,9 @@ namespace math { template class Unpool2dMaxFunctor { public: - void operator()( - const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output) { + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -50,10 +50,12 @@ class Unpool2dMaxFunctor { template class Unpool2dMaxGradFunctor { public: - void operator()( - const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, const framework::Tensor& output, - 
const framework::Tensor& output_grad, framework::Tensor* input_grad) { + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 4c6cb7bbca..f3a317b3b3 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -19,10 +19,12 @@ namespace paddle { namespace operators { namespace math { template -__global__ void KernelUnpool2dMax( - const int nthreads, const T* input_data, const int* indices_data, - const int input_height, const int input_width, const int channels, - T* output_data, const int output_height, const int output_width) { +__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, + const int* indices_data, + const int input_height, const int input_width, + const int channels, T* output_data, + const int output_height, + const int output_width) { int in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -40,11 +42,12 @@ __global__ void KernelUnpool2dMax( } } template -__global__ void KernelUnpool2dMaxGrad( - const int nthreads, const T* input_data, const int* indices_data, - const int input_height, const int input_width, const int channels, - const T* output_data, const T* output_grad, const int output_height, - const int output_width, T* input_grad) { +__global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, + const int* indices_data, + const int input_height, const int input_width, + const int channels, const T* output_data, + const T* output_grad, const int output_height, + const int output_width, T* input_grad) { int in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -67,9 +70,9 @@ __global__ void KernelUnpool2dMaxGrad( template class Unpool2dMaxFunctor { public: - void operator()( - const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output) { + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, const framework::Tensor& indices, + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -81,11 +84,12 @@ class Unpool2dMaxFunctor { T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMax<<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_height, output_width); + KernelUnpool2dMax< + T><<(context) + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, + output_data, output_height, output_width); } }; /* @@ -113,11 +117,13 @@ class Unpool2dMaxGradFunctor { T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMaxGrad<<(context) - .stream()>>>(input.numel(), input_data, 
indices_data, - input_height, input_width, output_channels, output_data, - output_grad_data, output_height, output_width, input_grad_data); + KernelUnpool2dMaxGrad< + T><<(context) + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, output_data, + output_grad_data, output_height, output_width, + input_grad_data); } }; template class Unpool2dMaxGradFunctor; diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 43e32bf4fb..61eadcdcd5 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -21,17 +21,20 @@ namespace math { template class Unpool2dMaxFunctor { public: - void operator()( - const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output); + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + framework::Tensor* output); }; template class Unpool2dMaxGradFunctor { public: - void operator()( - const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, const framework::Tensor& output, - const framework::Tensor& output_grad, framework::Tensor* input_grad); + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad); }; } // namespace math } // namespace operators diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index a51df3aa42..a40aadcccc 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -32,24 +32,22 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); AddOutput("Out", - "(Tensor) The output tensor of unpool operator." - "The format of output tensor is also NCHW." - "Where N is batch size, C is " - "the number of channels, H and W is the height and " - "width of feature."); + "(Tensor) The output tensor of unpool operator." + "The format of output tensor is also NCHW." 
+ "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); AddAttr>( "ksize", "(vector), the unpooling window size(height, width) " "of unpooling operator."); - AddAttr>( - "strides", - "(vector, default:{1, 1}), " - "strides (height, width) of unpooling operator.") + AddAttr>("strides", + "(vector, default:{1, 1}), " + "strides (height, width) of unpooling operator.") .SetDefault({1, 1}); - AddAttr>( - "paddings", - "(vector defalut:{0,0}), " - "paddings (height, width) of unpooling operator.") + AddAttr>("paddings", + "(vector defalut:{0,0}), " + "paddings (height, width) of unpooling operator.") .SetDefault({0, 0}); AddAttr( "unpooling_type", @@ -75,71 +73,71 @@ int OutputSize(int input_size, int ksize, int padding, int stride) { } class UnpoolOp : public framework::OperatorWithKernel { - protected: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), + framework::ToDataType(ctx.Input("X")->type()), ctx.device_context()); } - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" + PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" "should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UnpoolOp should not be null."); - auto in_x_dims = ctx->GetInputDim("X"); - auto in_y_dims = ctx->GetInputDim("Indices"); - std::string unpooling_type = + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = ctx->Attrs().Get("unpooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE(in_x_dims.size() == 4, + PADDLE_ENFORCE(in_x_dims.size() == 4, "Unpooling intput must be of 4-dimensional."); - PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back( - OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); - } - ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - } + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } }; class UnpoolOpGrad : public framework::OperatorWithKernel { - protected: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - 
ctx.device_context()); - } + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Input(X@GRAD) should not be null."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } }; -} // namespace operators -} // namespace paddle +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); REGISTER_OP_CPU_KERNEL( - unpool, ops::UnpoolKernel, - ops::UnpoolKernel); + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + unpool_grad, ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 8ee9e2b373..29b393f474 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -16,8 +16,8 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL( - unpool, ops::UnpoolKernel, - ops::UnpoolKernel); + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_GPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + unpool_grad, ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 321cd9fab8..e87f283042 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -55,13 +55,13 @@ class TestUnpoolOp(OpTest): self.inputs = { 'X': input.astype('float32'), 'Indices': indices.astype('int32') - } + } self.attrs = { 'strides': self.strides, 'paddings': self.paddings, 'ksize': self.ksize, 'unpooling_type': self.unpooling_type, - } + } self.outputs = {'Out': output.astype('float32')} def test_check_output(self): @@ -78,5 +78,6 @@ class TestUnpoolOp(OpTest): self.strides = [2, 2] self.paddings = [0, 0] + if __name__ == '__main__': unittest.main() From 3206094b5eaf919aac6cdcae46254055ddf98ed9 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 29 Nov 2017 15:04:56 +0800 Subject: [PATCH 22/36] format code --- paddle/operators/math/unpooling.cc | 4 +-- paddle/operators/math/unpooling.cu | 36 ++++++++++++------------- paddle/operators/math/unpooling.h | 3 +-- paddle/operators/unpool_op.cc | 42 ++++++++++++++++-------------- paddle/operators/unpool_op.cu.cc | 6 ++--- 5 files changed, 46 insertions(+), 45 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 9017ffaab1..b57d3dc141 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -20,8 +20,8 @@ template class Unpool2dMaxFunctor { public: void operator()(const 
platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output) { + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index f3a317b3b3..058b82d9d2 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -20,11 +20,12 @@ namespace operators { namespace math { template __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const int* indices_data, - const int input_height, const int input_width, - const int channels, T* output_data, - const int output_height, - const int output_width) { + const int* indices_data, + const int input_height, + const int input_width, + const int channels, T* output_data, + const int output_height, + const int output_width) { int in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -42,12 +43,11 @@ __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, } } template -__global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, - const int* indices_data, - const int input_height, const int input_width, - const int channels, const T* output_data, - const T* output_grad, const int output_height, - const int output_width, T* input_grad) { +__global__ void KernelUnpool2dMaxGrad( + const int nthreads, const T* input_data, const int* indices_data, + const int input_height, const int input_width, const int channels, + const T* output_data, const T* output_grad, const int output_height, + const int output_width, T* input_grad) { int in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -71,8 +71,8 @@ template class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, const framework::Tensor& indices, - framework::Tensor* output) { + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -88,8 +88,8 @@ class Unpool2dMaxFunctor { T><<(context) .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_height, output_width); + input_height, input_width, output_channels, + output_data, output_height, output_width); } }; /* @@ -121,9 +121,9 @@ class Unpool2dMaxGradFunctor { T><<(context) .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, output_data, - output_grad_data, output_height, output_width, - input_grad_data); + input_height, input_width, output_channels, + output_data, output_grad_data, output_height, + output_width, input_grad_data); } }; template class Unpool2dMaxGradFunctor; diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 61eadcdcd5..7077d7c227 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -23,8 +23,7 @@ class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& 
input, - const framework::Tensor& indices, - framework::Tensor* output); + const framework::Tensor& indices, framework::Tensor* output); }; template class Unpool2dMaxGradFunctor { diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index a40aadcccc..8bd596dbb0 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -75,36 +75,38 @@ int OutputSize(int input_size, int ksize, int padding, int stride) { class UnpoolOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.device_context()); - } + } public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" - "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input(Indices) of UnpoolOp" "should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UnpoolOp should not be null."); auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Indices"); - std::string unpooling_type = - ctx->Attrs().Get("unpooling_type"); + std::string unpooling_type = ctx->Attrs() + .Get("unpooling_type"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); PADDLE_ENFORCE(in_x_dims.size() == 4, - "Unpooling intput must be of 4-dimensional."); + "Unpooling intput must be of 4-dimensional."); PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); for (size_t i = 0; i < ksize.size(); ++i) { output_shape.push_back( - OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); } @@ -113,30 +115,30 @@ class UnpoolOp : public framework::OperatorWithKernel { class UnpoolOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.device_context()); - } + } public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Input(X@GRAD) should not be null."); + "Input(X@GRAD) should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; -} // namespace operators -} // namespace paddle +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL( - unpool, ops::UnpoolKernel, - ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL(unpool, + ops::UnpoolKernel, + 
ops::UnpoolKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     unpool_grad, ops::UnpoolGradKernel<paddle::platform::CPUPlace, float>,
     ops::UnpoolGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc
index 29b393f474..18aafb7dc7 100644
--- a/paddle/operators/unpool_op.cu.cc
+++ b/paddle/operators/unpool_op.cu.cc
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "paddle/operators/unpool_op.h"

 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    unpool, ops::UnpoolKernel<paddle::platform::GPUPlace, float>,
-    ops::UnpoolKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(unpool,
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, float>,
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
     unpool_grad, ops::UnpoolGradKernel<paddle::platform::GPUPlace, float>,
     ops::UnpoolGradKernel<paddle::platform::GPUPlace, double>);

From 4ffb73fd3b353c3d2acfcea3106bfd1a4d4202ee Mon Sep 17 00:00:00 2001
From: sweetsky0901
Date: Wed, 29 Nov 2017 15:51:28 +0800
Subject: [PATCH 23/36] format ..

---
 paddle/operators/math/unpooling.cu | 25 ++++++++++++-------------
 paddle/operators/unpool_op.cc      |  8 +++-----
 2 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu
index 058b82d9d2..37c3c8b689 100644
--- a/paddle/operators/math/unpooling.cu
+++ b/paddle/operators/math/unpooling.cu
@@ -21,8 +21,7 @@ namespace math {
 template <typename T>
 __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
                                   const int* indices_data,
-                                  const int input_height,
-                                  const int input_width,
+                                  const int input_height, const int input_width,
                                   const int channels, T* output_data,
                                   const int output_height,
                                   const int output_width) {
@@ -71,8 +70,8 @@ template <typename T>
 class Unpool2dMaxFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& indices, framework::Tensor* output) {
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -86,10 +85,10 @@ class Unpool2dMaxFunctor<platform::GPUPlace, T> {
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool2dMax<
         T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(input.numel(), input_data, indices_data,
-                              input_height, input_width, output_channels,
-                              output_data, output_height, output_width);
+              reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                  .stream()>>>(input.numel(), input_data, indices_data,
+                               input_height, input_width, output_channels,
+                               output_data, output_height, output_width);
   }
 };
 /*
@@ -119,11 +118,11 @@ class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool2dMaxGrad<
         T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(input.numel(), input_data, indices_data,
-                              input_height, input_width, output_channels,
-                              output_data, output_grad_data, output_height,
-                              output_width, input_grad_data);
+              reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                  .stream()>>>(input.numel(), input_data, indices_data,
+                               input_height, input_width, output_channels,
+                               output_data, output_grad_data, output_height,
+                               output_width, input_grad_data);
   }
 };
 template class Unpool2dMaxGradFunctor<platform::GPUPlace, float>;
diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc
index 8bd596dbb0..89c48e071c 100644
--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
@@ -94,12 +94,11 @@ class UnpoolOp : public framework::OperatorWithKernel {
                    "Output(Out) of UnpoolOp should not be null.");
     auto in_x_dims = ctx->GetInputDim("X");
     auto in_y_dims = ctx->GetInputDim("Indices");
-    std::string unpooling_type = ctx->Attrs()
-        .Get<std::string>("unpooling_type");
+    std::string unpooling_type =
+        ctx->Attrs().Get<std::string>("unpooling_type");
     std::vector<int> ksize =
ctx->Attrs().Get<std::vector<int>>("ksize"); std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); - std::vector<int> paddings = - ctx->Attrs().Get<std::vector<int>>("paddings"); + std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); PADDLE_ENFORCE(in_x_dims.size() == 4, "Unpooling intput must be of 4-dimensional."); PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); @@ -142,4 +141,3 @@ REGISTER_OP_CPU_KERNEL(unpool, REGISTER_OP_CPU_KERNEL( unpool_grad, ops::UnpoolGradKernel<paddle::platform::CPUPlace, float>, ops::UnpoolGradKernel<paddle::platform::CPUPlace, double>); - From a5236265b752b9dfad32ae1188798b22eaba9a22 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 29 Nov 2017 15:55:21 +0800 Subject: [PATCH 24/36] Refine doc for smooth l1 loss op. --- paddle/operators/smooth_l1_loss_op.cc | 62 ++++++++++++++++----------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc index ebf7b43700..50543fcc14 100644 --- a/paddle/operators/smooth_l1_loss_op.cc +++ b/paddle/operators/smooth_l1_loss_op.cc @@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same."); + PADDLE_ENFORCE_EQ(x_dims, y_dims); PADDLE_ENFORCE_GE(x_dims.size(), 2, - "The tensor rank of X must be at least 2."); + "The tensor rank of Input(X) should not be less than 2."); if (ctx->HasInput("InsideWeight")) { PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"), "If weights are provided, must specify both " "inside and outside weights."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims, - "The shape of InsideWeight must be same as X."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims, - "The shape of OutsideWeight must be same as X."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims); } ctx->SetOutputDim("Diff", x_dims); @@ -53,25 +51,29 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", - "The input tensor of smooth l1 loss op." - "The rank should be greater or equal to 2 with shape " - "[batch_size, value_dim1, value_dim2, ..., value_dimN]"); + "(Tensor, default Tensor<float>) A tensor with rank at least 2. " + "The input value of smooth l1 loss op with shape " + "[batch_size, dim1, ..., dimN]."); AddInput("Y", - "The target tensor of smooth l1 loss op " - "with the same shape as X."); + "(Tensor, default Tensor<float>) A tensor with rank at least 2. " + "The target value of smooth l1 loss op with same shape as X."); AddInput("InsideWeight", - "Optional input tensor of smooth l1 loss op with the same shape " - "as X. If provided, the result of (X - Y) will be multiplied " + "(Tensor, default Tensor<float>) A tensor with rank at least 2. " + "This input is optional and should have same shape with X. " + "If provided, the result of (X - Y) will be multiplied " "by this tensor element by element.") .AsDispensable(); AddInput("OutsideWeight", "Optinal input of smooth l1 loss op with the same shape as X."
- "If provided, the output smooth l1 loss will be multiplied by " - "this tensor element by element.") + "(Tensor, default Tensor) A tensor with rank at least 2. " + "This input is optional and should have same shape with X. " + "If provided, the out smooth l1 loss will be multiplied by this " + "tensor element by element.") .AsDispensable(); - AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).") + AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).") .AsIntermediate(); - AddOutput("Out", "Smooth l1 loss."); + AddOutput("Out", + "(Tensor, default Tensor) A tensor with rank be 2. " + "The output smooth l1 loss with shape [batch_size, 1]."); AddAttr("sigma", "Hyper parameter of smooth l1 loss op." "A float scalar with default value 3.0.") @@ -79,15 +81,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Smooth L1 Loss Operator. -This operator computes the smooth l1 loss for input and target. -The operator takes the first dimension of input as the batch size. +This operator computes the smooth l1 loss for X and Y. +The operator takes the first dimension of X and Y as batch size. For each instance, it computes the smooth l1 loss element by element first -and then sums all the losses. So the resulting output shape -is [batch_size, 1]. +and then sums all the losses. So the shape of Out is [batch_size, 1]. The equation is: -loss = $$0.5 * (\sigma * (x-y))^2$$ if $$|x - y| < 1 /({\sigma}^2)$$ - $$\frac{|x - y| - 0.5}{{\sigma}^2}$$ otherwise +$$ +Out_{\sigma}(X, Y)_i = \begin{cases} +0.5 * (\sigma * (X_i - Y_i)) ^ 2 +\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\ +\frac{|X_i - Y_i| - 0.5}{{\sigma}^2}, +\quad otherwise +\end{cases} +$$ + +In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith +element of Out, X and Y. )DOC"); } From ff8a6778483dcaff32e5e0acc056cf45d12148ff Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 29 Nov 2017 13:42:42 +0000 Subject: [PATCH 25/36] Revise comments in rank_loss_op --- paddle/operators/rank_loss_op.cc | 31 ++++++++++++++++++++----------- paddle/operators/rank_loss_op.cu | 2 +- paddle/operators/rank_loss_op.h | 2 +- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 061e82412e..87774a56f3 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -35,9 +35,10 @@ class RankLossOp : public framework::OperatorWithKernel { auto right_dims = ctx->GetInputDim("Right"); PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), - "All inputs must have the same size"); - PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1), - "All inputs must be row vector with size batch_size x 1."); + "All inputs must have the same size."); + PADDLE_ENFORCE( + (label_dims.size() == 2) && (label_dims[1] == 1), + "All inputs must be 2-D tensors with shape [batch_size x 1]."); ctx->SetOutputDim("Out", label_dims); } }; @@ -48,10 +49,17 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Label", - "The label indicating A ranked higher than B or not, row vector."); - AddInput("Left", "The output of RankNet for doc A, vector."); - AddInput("Right", "The output of RankNet for doc B, vetor."); - AddOutput("Out", "The output loss of RankLoss operator, vector."); + "(2-D Tensor with shape [batch_size x 1]) " + "The label indicating A ranked higher than B or not."); + AddInput("Left", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc A."); + AddInput("Right", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc B."); + AddOutput("Out", + "(2-D Tensor with shape [batch_size x 1]) " + "The output loss of RankLoss operator."); AddComment(R"DOC( RankLoss Operator. @@ -65,8 +73,9 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of the input pair. The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label -(P_{i,j}), which represent the output of RankNet for the two docs and the label, -respectively, and yields the rank loss C_{i,j} using the following equation: +(P_{i,j}), which represent the output score of RankNet for the two docs and +the label respectively, and yields the rank loss C_{i,j} using the following +equation: \f$$ C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\ @@ -74,7 +83,7 @@ respectively, and yields the rank loss C_{i,j} using the following equation: \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} \f$$ -The operator can take inputs of one sample or in batch. +The operator can take batch inputs with size batch_size (batch_size >= 1). )DOC"); } diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu index 779588ff36..5382e3a629 100644 --- a/paddle/operators/rank_loss_op.cu +++ b/paddle/operators/rank_loss_op.cu @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h index f184d6efcb..703c77a0b2 100644 --- a/paddle/operators/rank_loss_op.h +++ b/paddle/operators/rank_loss_op.h @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, From 4d1ee0ff126de91d7705f5587400466926ba5907 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 29 Nov 2017 13:56:27 +0000 Subject: [PATCH 26/36] Amend license and comments in reshape_op --- paddle/operators/reshape_op.cc | 7 +++---- paddle/operators/{reshape_op.cu.cc => reshape_op.cu} | 2 +- paddle/operators/reshape_op.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) rename paddle/operators/{reshape_op.cu.cc => reshape_op.cu} (94%) diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index ba774ec216..39bf2118d6 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -1,11 +1,10 @@ - /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -38,8 +37,8 @@ class ReshapeOp : public framework::OperatorWithKernel { // TODO(qiao) change batch_size for (size_t i = 1; i < shape.size(); ++i) { PADDLE_ENFORCE(shape[i] > 0, - "Each dimension of shape " - "must be positiv except the first."); + "Each dimension of Attr(shape) " + "must be positive except the first one."); } if (shape[0] < 0) { shape[0] = x_dims[0]; diff --git a/paddle/operators/reshape_op.cu.cc b/paddle/operators/reshape_op.cu similarity index 94% rename from paddle/operators/reshape_op.cu.cc rename to paddle/operators/reshape_op.cu index 23dbe089d3..dca6c15007 100644 --- a/paddle/operators/reshape_op.cu.cc +++ b/paddle/operators/reshape_op.cu @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index 0e98c8b4f4..73fd1da642 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, From 7300655ffd5deb47e24e493524534c94570ed48b Mon Sep 17 00:00:00 2001 From: "Yang Yang(Tony)" Date: Wed, 29 Nov 2017 10:51:59 -0800 Subject: [PATCH 27/36] Update cpu_profiling.md (#6012) --- doc/howto/optimization/cpu_profiling.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md index 32d89a7c18..b3330b0b59 100644 --- a/doc/howto/optimization/cpu_profiling.md +++ b/doc/howto/optimization/cpu_profiling.md @@ -71,7 +71,7 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py ``` -可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python与C++混合代码的性能分析`来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 +可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 ```text Called By: @@ -121,7 +121,7 @@ python -m yep -v main.py 1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 -3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟如果单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 +3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 ### 查看性能分析文件 From 35572355c2261c493aa782ba1255971f4dfa385e Mon Sep 17 00:00:00 2001 From: kexinzhao <19hskevin87@gmail.com> Date: Wed, 29 Nov 2017 11:13:39 -0800 Subject: [PATCH 28/36] Edit float16 doc (#5851) * Add survey of support of half in different CUDA versions * small fix --- doc/design/float16.md | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/doc/design/float16.md b/doc/design/float16.md index 078801ba2e..1ea95ed6b5 100644 --- a/doc/design/float16.md +++ b/doc/design/float16.md @@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co - [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors. - [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU). +### CUDA version issue +There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. +CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows: +``` +typedef struct __align__(2) { + unsigned short x; +} __half; + +typedef __half half; +``` +This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types: +``` +__global__ void Add() { + half a, b, c; + c = __hadd(a, b); // correct + c = a + b; // compiler error: no operator "+" matches these operands +} +``` +CUDA 9.0 provides a major update to the half data type. 
The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp). + +Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows: +``` +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __half_raw; + + +struct __CUDA_ALIGN__(2) __half { +protected: + unsigned short __x; +public: + // constructors and conversion operators from/to + // __half_raw and other built-in data types +} + +typedef __half half; + +__device__ __forceinline__ +__half operator+(const __half &lh, const __half &rh) { + return __hadd(lh, rh); +} + +// Other overloaded operators +``` +This new design makes `c = a + b` work correctly for CUDA half data type. ## Implementation The float16 class holds a 16-bit `uint16_t` data internally. From 00eceea06e0b7e7771c027bac190078f6ed4e77f Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Thu, 30 Nov 2017 10:32:03 +0800 Subject: [PATCH 29/36] Fix the problem that building for Android fails with WITH_TESTING=ON. (#6051) --- paddle/gserver/tests/CMakeLists.txt | 51 ++++++++++++++--------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index c295ea19c9..24e6cae8e6 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -62,11 +62,11 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) endif() if(NOT MOBILE_INFERENCE) -################## test_Evaluator ####################### + ################## test_Evaluator ####################### add_unittest(test_Evaluator test_Evaluator.cpp) -############### test_RecurrentGradientMachine ############### + ############### test_RecurrentGradientMachine ############### # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine # I will fix it. 
add_unittest_without_exec(test_RecurrentGradientMachine @@ -77,7 +77,7 @@ if(NOT MOBILE_INFERENCE) ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) -############### test_NetworkCompare ############### + ############### test_NetworkCompare ############### add_unittest_without_exec(test_NetworkCompare test_NetworkCompare.cpp) if(WITH_GPU) @@ -89,34 +89,33 @@ if(NOT MOBILE_INFERENCE) COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() -endif() + ################# test_CompareSparse ################## + add_unittest_without_exec(test_CompareSparse + test_CompareSparse.cpp) + if(NOT ON_TRAVIS) + add_test(NAME test_CompareSparse + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests + ./.set_port.sh -p port -n 6 + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) + endif() + + ################ test_CompareTwoNets ###################### + add_unittest_without_exec(test_CompareTwoNets + test_CompareTwoNets.cpp) + add_test(NAME test_CompareTwoNets + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +endif() +################ test_PyDataProvider2 ###################### add_unittest_without_exec(test_PyDataProvider2 test_PyDataProvider2.cpp) - add_test(NAME test_PyDataProvider2 COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle ) - -################# test_CompareSparse ################## -add_unittest_without_exec(test_CompareSparse - test_CompareSparse.cpp) -if(NOT ON_TRAVIS) - add_test(NAME test_CompareSparse - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ./.set_port.sh -p port -n 6 - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) -endif() - -################ test_CompareTwoNets ###################### -add_unittest_without_exec(test_CompareTwoNets - test_CompareTwoNets.cpp) -add_test(NAME test_CompareTwoNets - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) From da62d6cc24e22b499204b415f8ab7d4ca96c71d2 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Thu, 30 Nov 2017 02:54:37 +0000 Subject: [PATCH 30/36] fix the doc display problem in rank_loss_op --- paddle/operators/rank_loss_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 87774a56f3..912f88f455 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -77,11 +77,11 @@ The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label the label respectively, and yields the rank loss C_{i,j} using the following equation: -\f$$ - C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\ +$$ + C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ o_{i,j} = o_i - 
o_j \\ \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} -\f$$ +$$ The operator can take batch inputs with size batch_size (batch_size >= 1). From e1b8c27acbba44a52b10b8593e95eb1279f60bf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Thu, 30 Nov 2017 12:01:06 +0800 Subject: [PATCH 31/36] Add back print_operators_doc (#5970) * add back print_operators_doc * fix style check * fix style check --- paddle/operators/detail/send_recv.proto | 2 +- paddle/scripts/docker/build.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto index 962c7d5981..07ff9d2c62 100644 --- a/paddle/operators/detail/send_recv.proto +++ b/paddle/operators/detail/send_recv.proto @@ -32,4 +32,4 @@ message VariableMessage { bytes serialized = 2; } -message VoidMessage {} \ No newline at end of file +message VoidMessage {} diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index a2fdc5ce69..502637c881 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -183,6 +183,7 @@ EOF ${DOCKERFILE_GPU_ENV} ADD go/cmd/pserver/pserver /usr/bin/ ADD go/cmd/master/master /usr/bin/ + ADD paddle/pybind/print_operators_doc /usr/bin/ # default command shows the paddle version and exit CMD ["paddle", "version"] EOF From dc91c4e3a42b678ad14742af8845b94c4a0ac50d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 30 Nov 2017 12:13:12 +0800 Subject: [PATCH 32/36] Fix MacOS compile (#6062) --- cmake/external/grpc.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 219ea1b908..86122aec8c 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -24,9 +24,9 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) IF(APPLE) - SET(BUILD_CMD make -n | sed "s/-Werror//g" | sh) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh) ELSE() - SET(BUILD_CMD make) + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin) ENDIF() ExternalProject_Add( @@ -42,7 +42,7 @@ ExternalProject_Add( # Disable -Werror, otherwise the compile will fail in MacOS. # It seems that we cannot configure that by make command. 
# Just dry run make command and remove `-Werror`, then use a shell to run make commands - BUILD_COMMAND ${BUILD_CMD} HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin + BUILD_COMMAND ${BUILD_CMD} INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install ) From 82dd1653ae48a54a2ec8371f927812b351164820 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Thu, 30 Nov 2017 12:18:23 +0800 Subject: [PATCH 33/36] Fix python.v2.fluid arg parse (#6055) * fix python gflags init * format code --- python/paddle/v2/fluid/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index c033b27bea..dd25bc19ec 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -36,7 +36,8 @@ def __read_gflags_from_env__(): read_env_flags = ['use_pinned_memory'] if core.is_compile_gpu(): read_env_flags.append('fraction_of_gpu_memory_to_use') - core.init_gflags(sys.argv + ["--tryfromenv=" + ",".join(read_env_flags)]) + core.init_gflags([sys.argv[0]] + + ["--tryfromenv=" + ",".join(read_env_flags)]) __read_gflags_from_env__() From 35453df18f738c18a7c66d886296068d88dc1304 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 30 Nov 2017 13:41:28 +0800 Subject: [PATCH 34/36] Fix ShareLoD bug (#6084) Fix #6087 --- paddle/framework/op_desc.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 48cd131550..02a8253243 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR, "The %d-th output of Output(%s) must be LoDTensor.", j, out); - in_var->SetLoDLevel(out_var->GetLodLevel()); + out_var->SetLoDLevel(in_var->GetLodLevel()); } bool IsRuntime() const override; From ac596a3952a3f75cc12f1eefafb14a165a57ff95 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 30 Nov 2017 14:14:13 +0800 Subject: [PATCH 35/36] Feature/switch program (#5932) * Unify fluid submodules to fluid module Change books just use `import fluid`, not submodules * Remove g_main_program/g_startup_program Use default_main_program/default_startup_program instead * Typo * Add API for switch default program * Two functions: switch_main_program/switch_startup_program * A guard: program_guard. Users can use the `with` statement change default programs * Change unittests in `test_layers` * Fix CI * Fix CI * Fix CI --- python/paddle/v2/fluid/framework.py | 79 +++++- python/paddle/v2/fluid/tests/test_layers.py | 271 ++++++++------------ 2 files changed, 188 insertions(+), 162 deletions(-) diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 1c42e4d44f..49c6d89834 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -3,10 +3,12 @@ import collections import numpy as np from . import core import proto.framework_pb2 as framework_pb2 +import contextlib __all__ = [ 'Block', 'Variable', 'Program', 'Operator', 'default_startup_program', - 'default_main_program' + 'default_main_program', 'program_guard', 'switch_startup_program', + 'switch_main_program' ] @@ -659,8 +661,83 @@ _startup_program_ = Program() def default_startup_program(): + """ + Get default startup program. In startup program, Paddle will initialize + parameters, initialize nccl handle, etc. 
+ + Returns: + Program: startup program + """ return _startup_program_ def default_main_program(): + """ + Get default main program. The main program is used for training or testing. + + Returns: + Program: main program + """ return _main_program_ + + +def switch_main_program(program): + """ + Switch the main program to a new program. + + Args: + program(Program): The new main program + + Returns: + Program: The previous main program + """ + global _main_program_ + prev_program = _main_program_ + _main_program_ = program + return prev_program + + +def switch_startup_program(program): + """ + Switch the startup program to a new program + Args: + program(Program): The new startup program + + Returns: + Program: The previous startup program + """ + global _startup_program_ + prev_program = _startup_program_ + _startup_program_ = program + return prev_program + + +@contextlib.contextmanager +def program_guard(main_program, startup_program=None): + """ + Switch program with `with` statement + + Examples: + >>> with program_guard(Program()): + >>> data = fluid.layers.data(...) + >>> hidden = fluid.layers.fc(...) + + Args: + main_program(Program): New main program inside `with` statement + startup_program(Program): New startup program inside `with` statement. + None means do not change startup program. + + Returns: + None + """ + if not isinstance(main_program, Program): + raise TypeError("main_program should be Program") + main_program = switch_main_program(main_program) + if startup_program is not None: + if not isinstance(startup_program, Program): + raise TypeError("startup_program should be Program") + startup_program = switch_startup_program(startup_program) + yield + switch_main_program(main_program) + if startup_program is not None: + switch_startup_program(startup_program) diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index b6906be60b..33b0e54f42 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -1,192 +1,141 @@ +from __future__ import print_function import unittest import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.framework import Program +from paddle.v2.fluid.framework import Program, program_guard class TestBook(unittest.TestCase): def test_fit_a_line(self): program = Program() - x = layers.data( - name='x', shape=[13], dtype='float32', main_program=program) - y_predict = layers.fc(input=x, size=1, act=None, main_program=program) + with program_guard(program, startup_program=Program()): + x = layers.data(name='x', shape=[13], dtype='float32') + y_predict = layers.fc(input=x, size=1, act=None) + y = layers.data(name='y', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + program.append_backward(avg_cost) - y = layers.data( - name='y', shape=[1], dtype='float32', main_program=program) - cost = layers.square_error_cost( - input=y_predict, label=y, main_program=program) - - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - program.append_backward(avg_cost) - - print str(program) + print(str(program)) def test_recognize_digits_mlp(self): program = Program() - - # Change g_program, so the rest layers use `g_program` - images = layers.data( - name='pixel', shape=[784], dtype='float32', main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', 
main_program=program) - hidden1 = layers.fc(input=images, - size=128, - act='relu', - main_program=program) - hidden2 = layers.fc(input=hidden1, - size=64, - act='relu', - main_program=program) - predict = layers.fc(input=hidden2, - size=10, - act='softmax', - main_program=program) - cost = layers.cross_entropy( - input=predict, label=label, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + # Change g_program, so the rest layers use `g_program` + images = layers.data(name='pixel', shape=[784], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + hidden1 = layers.fc(input=images, size=128, act='relu') + hidden2 = layers.fc(input=hidden1, size=64, act='relu') + predict = layers.fc(input=hidden2, size=10, act='softmax') + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + + print(str(program)) def test_simple_conv2d(self): program = Program() - images = layers.data( - name='pixel', - shape=[3, 48, 48], - dtype='int32', - main_program=program) - layers.conv2d( - input=images, - num_filters=3, - filter_size=[4, 4], - main_program=program) - - print str(program) + with program_guard(program, startup_program=Program()): + images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32') + layers.conv2d(input=images, num_filters=3, filter_size=[4, 4]) + + print(str(program)) def test_conv2d_transpose(self): program = Program() - kwargs = {'main_program': program} - img = layers.data( - name='pixel', shape=[3, 2, 2], dtype='float32', **kwargs) - layers.conv2d_transpose( - input=img, num_filters=10, output_size=28, **kwargs) - print str(program) + with program_guard(program): + img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32') + layers.conv2d_transpose(input=img, num_filters=10, output_size=28) + print(str(program)) def test_recognize_digits_conv(self): program = Program() - - images = layers.data( - name='pixel', - shape=[1, 28, 28], - dtype='float32', - main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', main_program=program) - conv_pool_1 = nets.simple_img_conv_pool( - input=images, - filter_size=5, - num_filters=2, - pool_size=2, - pool_stride=2, - act="relu", - main_program=program) - conv_pool_2 = nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=4, - pool_size=2, - pool_stride=2, - act="relu", - main_program=program) - - predict = layers.fc(input=conv_pool_2, - size=10, - act="softmax", - main_program=program) - cost = layers.cross_entropy( - input=predict, label=label, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - - program.append_backward(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + images = layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + conv_pool_1 = nets.simple_img_conv_pool( + input=images, + filter_size=5, + num_filters=2, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=4, + pool_size=2, + pool_stride=2, + act="relu") + + predict = layers.fc(input=conv_pool_2, size=10, act="softmax") + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + + program.append_backward(avg_cost) + + 
print(str(program)) def test_word_embedding(self): program = Program() - dict_size = 10000 - embed_size = 32 - first_word = layers.data( - name='firstw', shape=[1], dtype='int64', main_program=program) - second_word = layers.data( - name='secondw', shape=[1], dtype='int64', main_program=program) - third_word = layers.data( - name='thirdw', shape=[1], dtype='int64', main_program=program) - forth_word = layers.data( - name='forthw', shape=[1], dtype='int64', main_program=program) - next_word = layers.data( - name='nextw', shape=[1], dtype='int64', main_program=program) - - embed_first = layers.embedding( - input=first_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - embed_second = layers.embedding( - input=second_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - - embed_third = layers.embedding( - input=third_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - embed_forth = layers.embedding( - input=forth_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - - concat_embed = layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], - axis=1, - main_program=program) - - hidden1 = layers.fc(input=concat_embed, - size=256, - act='sigmoid', - main_program=program) - predict_word = layers.fc(input=hidden1, - size=dict_size, - act='softmax', - main_program=program) - cost = layers.cross_entropy( - input=predict_word, label=next_word, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + dict_size = 10000 + embed_size = 32 + first_word = layers.data(name='firstw', shape=[1], dtype='int64') + second_word = layers.data(name='secondw', shape=[1], dtype='int64') + third_word = layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = layers.data(name='forthw', shape=[1], dtype='int64') + next_word = layers.data(name='nextw', shape=[1], dtype='int64') + + embed_first = layers.embedding( + input=first_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + embed_second = layers.embedding( + input=second_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + + embed_third = layers.embedding( + input=third_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + embed_forth = layers.embedding( + input=forth_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + + concat_embed = layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], + axis=1) + + hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid') + predict_word = layers.fc(input=hidden1, + size=dict_size, + act='softmax') + cost = layers.cross_entropy(input=predict_word, label=next_word) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + + print(str(program)) def test_linear_chain_crf(self): program = Program() - - # Change g_program, so the rest layers use `g_program` - images = layers.data( - name='pixel', shape=[784], dtype='float32', main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', main_program=program) - hidden = layers.fc(input=images, size=128, main_program=program) - crf = layers.linear_chain_crf( - input=hidden, label=label, main_program=program) - - print 
str(program) + with program_guard(program, startup_program=Program()): + images = layers.data(name='pixel', shape=[784], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + hidden = layers.fc(input=images, size=128) + crf = layers.linear_chain_crf(input=hidden, label=label) + self.assertNotEqual(crf, None) + + print(str(program)) if __name__ == '__main__': From 605b3e449911420e5a171085d457916d668268e1 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 29 Nov 2017 23:22:19 -0800 Subject: [PATCH 36/36] Translate the CPU profiling document (#6073) * Translate the CPU profiling document * Paragraphing --- doc/howto/optimization/cpu_profiling.md | 166 +++++++++++++-------- doc/howto/optimization/cpu_profiling_cn.md | 155 +++++++++++++++++++ 2 files changed, 255 insertions(+), 66 deletions(-) create mode 100644 doc/howto/optimization/cpu_profiling_cn.md diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md index b3330b0b59..e1d91c668e 100644 --- a/doc/howto/optimization/cpu_profiling.md +++ b/doc/howto/optimization/cpu_profiling.md @@ -1,42 +1,52 @@ -此教程会介绍如何使用Python的cProfile包,与Python库yep,google perftools来运行性能分析(Profiling)与调优。 +This tutorial introduces techniques we used to profile and tune the +CPU performance of PaddlePaddle. We will use the Python packages +`cProfile` and `yep`, and Google `perftools`. -运行性能分析可以让开发人员科学的,有条不紊的对程序进行性能优化。性能分析是性能调优的基础。因为在程序实际运行中,真正的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。 +Profiling is the process that reveals the performance bottlenecks, +which could be very different from what's in the developers' minds. +Performance tuning is to fix the bottlenecks. Performance optimization +repeats the steps of profiling and tuning alternately. -性能优化的步骤,通常是循环重复若干次『性能分析 --> 寻找瓶颈 ---> 调优瓶颈 --> 性能分析确认调优效果』。其中性能分析是性能调优的至关重要的量化指标。 +PaddlePaddle users program AI by calling the Python API, which calls +into `libpaddle.so`, written in C++. In this tutorial, we focus on +the profiling and tuning of -Paddle提供了Python语言绑定。用户使用Python进行神经网络编程,训练,测试。Python解释器通过`pybind`和`swig`调用Paddle的动态链接库,进而调用Paddle C++部分的代码。所以Paddle的性能分析与调优分为两个部分: +1. the Python code and +1. the mixture of Python and C++ code. -* Python代码的性能分析 -* Python与C++混合代码的性能分析 +## Profiling the Python Code +### Generate the Performance Profiling File -## Python代码的性能分析 - -### 生成性能分析文件 - -Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下: +We can use the standard Python +package, [`cProfile`](https://docs.python.org/2/library/profile.html), +to generate a Python profiling file. For example: ```bash python -m cProfile -o profile.out main.py ``` -其中`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印一些统计信息到`stdout`。这不方便我们进行后期处理(进行`sort`, `split`, `cut`等等)。 - -### 查看性能分析文件 +where `main.py` is the program we are going to profile, `-o` specifies +the output file. Without `-o`, `cProfile` would output to standard +output. -当main.py运行完毕后,性能分析结果文件`profile.out`就生成出来了。我们可以使用[cprofilev](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来。 +### Look into the Profiling File + +`cProfile` generates `profile.out` after `main.py` completes.
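Besides the web-based viewer described next, the same file can be inspected directly with the standard-library `pstats` module. A minimal sketch (it only assumes the `profile.out` produced by the command above):

```python
import pstats

# Load the statistics written by `python -m cProfile -o profile.out main.py`.
stats = pstats.Stats('profile.out')

# Strip long directory prefixes, sort by internal time, and
# print the ten most expensive functions.
stats.strip_dirs().sort_stats('tottime').print_stats(10)

# Print who calls a given function, similar to the "Called By"
# view shown later in this tutorial.
stats.print_callers('sync_with_cpp')
```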
We can +use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into +the details: ```bash cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py ``` -其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 +where `-a` specifies the HTTP IP, `-p` specifies the port, `-f` +specifies the profiling file, and `main.py` is the source file. -访问对应网址,即可显示性能分析的结果。性能分析结果格式如下: +Open the Web browser and points to the local IP and the specifies +port, we will see the output like the following: -```text +``` ncalls tottime percall cumtime percall filename:lineno(function) 1 0.284 0.284 29.514 29.514 main.py:1() 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) @@ -44,23 +54,23 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() ``` -每一列的含义是: +where each line corresponds to Python function, and the meaning of +each column is as follows: -| 列名 | 含义 | +| column | meaning | | --- | --- | -| ncalls | 函数的调用次数 | -| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 | -| percall | tottime的每次调用平均时间 | -| cumtime | 函数总时间。包含这个函数调用其他函数的时间 | -| percall | cumtime的每次调用平均时间 | -| filename:lineno(function) | 文件名, 行号,函数名 | +| ncalls | the number of calls into a function | +| tottime | the total execution time of the function, not including the + execution time of other functions called by the function | +| percall | tottime divided by ncalls | +| cumtime | the total execution time of the function, including the execution time of other functions being called | +| percall | cumtime divided by ncalls | +| filename:lineno(function) | where the function is defined | +### Identify Performance Bottlenecks -### 寻找性能瓶颈 - -通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。 - -将性能分析结果按照tottime排序,效果如下: +Usually, `tottime` and the related `percall` time is what we want to +focus on. We can sort above profiling file by tottime: ```text 4696 12.040 0.003 12.040 0.003 {built-in method run} @@ -68,12 +78,15 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() - ``` -可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 +We can see that the most time-consuming function is the `built-in +method run`, which is a C++ function in `libpaddle.so`. We will +explain how to profile C++ code in the next section. At the right +moment, let's look into the third function `sync_with_cpp`, which is a +Python function. We can click it to understand more about it: -```text +``` Called By: Ordered by: internal time @@ -92,72 +105,93 @@ Called: List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> ``` -通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 +The lists of the callers of `sync_with_cpp` might help us understand +how to improve the function definition. 
## Profiling Python and C++ Code ### Generate the Profiling File To profile a mixture of Python and C++ code, we can use a Python package, `yep`, that works with Google's `perftools`, which is a commonly-used profiler for C/C++ code. In Ubuntu systems, we can install `yep` and `perftools` by running the following commands: ```bash apt update apt install libgoogle-perftools-dev pip install yep ``` Then we can run the following command ```bash python -m yep -v main.py ``` to generate the profiling file. The default filename is `main.py.prof`. Please be aware of the `-v` command line option, which prints the analysis results after generating the profiling file. By taking a glance at the printed result, we'd know whether we stripped debug information from `libpaddle.so` at build time. The following hints help make sure that the analysis results are readable: 1. Use the GCC command line option `-g` when building `libpaddle.so` so as to include debug information. The standard building system of PaddlePaddle is CMake, so you might want to set `CMAKE_BUILD_TYPE=RelWithDebInfo`. 1. Use the GCC command line option `-O2` or `-O3` to generate optimized binary code. It doesn't make sense to profile `libpaddle.so` without optimization, because it would run slowly anyway. 1. Profile the single-threaded binary before the multi-threaded version, because the latter often generates tangled profiling analysis results. You might want to set the environment variable `OMP_NUM_THREADS=1` to prevent OpenMP from automatically starting multiple threads. ### Look into the Profiling File The tool we used to look into the profiling file generated by `perftools` is [`pprof`](https://github.com/google/pprof), which provides a Web-based GUI like `cprofilev`. We can rely on the standard Go toolchain to retrieve the source code of `pprof` and build it: ```bash go get github.com/google/pprof ``` Then we can use it to profile `main.py.prof` generated in the previous section: ```bash pprof -http=0.0.0.0:3213 `which python` ./main.py.prof ``` where `-http` specifies the IP and port of the HTTP service. Directing our Web browser to the service, we would see something like the following: ![result](./pprof_1.png) ### Identifying the Performance Bottlenecks Similar to how we work with `cprofilev`, we'd focus on `tottime` and `cumtime`.
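Before turning to the figure below, note that `yep` can also be invoked from inside the program, so that `perftools` samples only the region we care about. A sketch, assuming `yep`'s `start`/`stop` API; `train_one_pass()` and the output file name are hypothetical:

```python
import yep

# Only the code between start() and stop() is sampled by perftools.
yep.start('train_step.prof')  # hypothetical output file name
train_one_pass()              # hypothetical function under investigation
yep.stop()
```

The resulting file can be viewed with the same `pprof` command as above.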
![kernel_perf](./pprof_2.png) -在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 - -在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。 - -## 总结 +We can see that the execution time of multiplication and the computing +of the gradient of multiplication takes 2% to 4% of the total running +time, and `MomentumOp` takes about 17%. Obviously, we'd want to +optimize `MomentumOp`. -至此,两种性能分析的方式都介绍完毕了。希望通过这两种性能分析的方式,Paddle的开发人员和使用人员可以有次序的,科学的发现和解决性能问题。 +`pprof` would mark performance critical parts of the program in +red. It's a good idea to follow the hint. diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/howto/optimization/cpu_profiling_cn.md new file mode 100644 index 0000000000..14eba0e2f3 --- /dev/null +++ b/doc/howto/optimization/cpu_profiling_cn.md @@ -0,0 +1,155 @@ +此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。 + +Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。 + +PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分: + +* Python 代码的性能分析 +* Python 与 C++ 混合代码的性能分析 + + +## Python代码的性能分析 + +### 生成性能分析文件 + +Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下: + +```bash +python -m cProfile -o profile.out main.py +``` + +其中 `main.py` 是我们要分析的程序,`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印到标准输出。 + +### 查看性能分析文件 + +`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 + +用Web浏览器访问对应网址,即可显示性能分析的结果: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +每一列的含义是: + +| 列名 | 含义 | +| --- | --- | +| ncalls | 函数的调用次数 | +| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 | +| percall | tottime的每次调用平均时间 | +| cumtime | 函数总时间。包含这个函数调用其他函数的时间 | +| percall | cumtime的每次调用平均时间 | +| filename:lineno(function) | 文件名, 行号,函数名 | + + +### 寻找性能瓶颈 + +通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。 + +将性能分析结果按照tottime排序,效果如下: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() +``` + +可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 + +```text +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... 
+ ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 + + + +## Python与C++混合代码的性能分析 + +### 生成性能分析文件 + +C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析 + +使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为 + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +安装完毕后,我们可以通过 + +```bash +python -m yep -v main.py +``` + +生成性能分析文件。生成的性能分析文件为`main.py.prof`。 + +命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施: + +1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 +2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 +3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 + +### 查看性能分析文件 + +在运行完性能分析后,会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。 + +安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下: + +```bash +go get github.com/google/pprof +``` + +进而我们可以使用如下命令开启一个HTTP服务: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。 + +访问对应的网址,我们可以查看性能分析的结果。结果如下图所示: + +![result](./pprof_1.png) + + +### 寻找性能瓶颈 + +与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。 + +例如下图中, + +![kernel_perf](./pprof_2.png) + +在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 + +在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。