From 298e74da1e6a1b8512cf2f7f0093e5659fca51f7 Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Thu, 28 Jun 2018 09:09:24 +0000
Subject: [PATCH 01/45] add squeeze op c++ part, compile success

---
 paddle/fluid/operators/squeeze_op.cc | 155 +++++++++++++++++++++++++++
 paddle/fluid/operators/squeeze_op.cu |  30 ++++++
 paddle/fluid/operators/squeeze_op.h  |  73 +++++++++++++
 3 files changed, 258 insertions(+)
 create mode 100644 paddle/fluid/operators/squeeze_op.cc
 create mode 100644 paddle/fluid/operators/squeeze_op.cu
 create mode 100644 paddle/fluid/operators/squeeze_op.h
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
new file mode 100644
index 0000000000..8f453b059f
--- /dev/null
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/squeeze_op.h"
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using framework::OpKernelType;
+using framework::Tensor;
+
+class SqueezeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SqueezeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SqueezeOp should not be null.");
+
+    const auto& x_dims = ctx->GetInputDim("X");
+    // TODO(chenweihang): need check input tensor dims (<9).
+
+    const auto& axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    // TODO(chenweihang): need check axes is valid.
+    // PADDLE_ENFORCE();
+    for (int a : axes) {
+      PADDLE_ENFORCE_LT(a, x_dims.size(),
+                        "The axis must be less than input tensor's rank.");
+    }
+
+    auto out_dims = GetOutputShape(axes, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    // TODO(chenweihang): need other check.
+  }
+
+  static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
+                                        const framework::DDim& in_dims) {
+    int num_squeeze_dims = squeeze_dims.size();
+    int cnt_squeezed_dims = 0;
+    bool should_squeeze[9] = {false};
+
+    // Determines number of dimensions of output tensor after squeeze.
+    // Mark and count the dimensions need to be squeezed
+    if (num_squeeze_dims == 0) {
+      for (int idx = 0; idx < in_dims.size(); ++idx) {
+        if (in_dims[idx] == 1) {
+          should_squeeze[idx] = true;
+          ++cnt_squeezed_dims;
+        }
+      }
+    } else {
+      for (int idx = 0; idx < num_squeeze_dims; ++idx) {
+        int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size()
+                                            : squeeze_dims[idx];
+        // TODO(chenweihang): shoude use PADALE_ENFORCE ? or if.
+        PADDLE_ENFORCE_GE(current, 0, "Invalid axis is given.");
+        PADDLE_ENFORCE_LT(current, in_dims.size(), "Invalid axis is given.");
+        PADDLE_ENFORCE_EQ(in_dims[current], 1, "Invalid axis is given.");
+
+        if (!(should_squeeze[current])) ++cnt_squeezed_dims;
+        should_squeeze[current] = true;
+      }
+    }
+
+    // Make output dimensions
+    std::vector<int64_t> output_shape(in_dims.size() - cnt_squeezed_dims, 0);
+    for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) {
+      if (!should_squeeze[in_idx]) {
+        output_shape[out_idx++] = in_dims[in_idx];
+      }
+    }
+
+    return framework::make_ddim(output_shape);
+  }
+};
+
+class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor), Tensors with at least max(dims) dimensions.");
+    AddOutput("Out", "(Tensor), Reshaped tensor with same data as input.");
+    AddAttr<std::vector<int>>("axes",
+                              "List of positive integers,"
+                              " indicate the dimensions to squeeze.");
+    AddAttr<bool>("inplace",
+                  "(default: false) Change the source tensor's shape without "
+                  "memory copy. When Attr(inplace) is set true, the output "
+                  "tensor shares memory with Input(X), otherwise, a new output "
+                  "tensor is created, and its data are copied from Input(x).")
+        .SetDefault(false);
+    AddComment(R"DOC(
+		Squeeze Operator.
+		
+		Remove single-dimensional entries from the shape of a tensor. 
+		Takes a parameter axes with a list of axes to squeeze. 
+		If axes is not provided, all the single dimensions will be removed from the shape. 
+        If an axis is selected with shape entry not equal to one, an error is raised.
+    )DOC");
+  }
+};
+
+class SqueezeGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SqueezeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Output(Out@GRAD/) of SqueezeOp should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp);
+REGISTER_OP_CPU_KERNEL(
+    squeeze, ops::SqueezeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    squeeze_grad,
+    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/squeeze_op.cu b/paddle/fluid/operators/squeeze_op.cu
new file mode 100644
index 0000000000..1096907daa
--- /dev/null
+++ b/paddle/fluid/operators/squeeze_op.cu
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/fluid/operators/squeeze_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    squeeze, ops::SqueezeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    squeeze_grad,
+    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h
new file mode 100644
index 0000000000..ce6f40e7a4
--- /dev/null
+++ b/paddle/fluid/operators/squeeze_op.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class SqueezeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    auto *in = ctx.Input<framework::LoDTensor>("X");
+
+    framework::DDim out_dims = out->dims();
+
+    // TODO(chenweihang): Where is this attr be add.
+    bool inplace = ctx.Attr<bool>("inplace");
+    out->Resize(out_dims);
+    if (!inplace) {
+      out->mutable_data<T>(ctx.GetPlace());
+      framework::TensorCopySync(*in, ctx.GetPlace(), out);
+      out->Resize(out_dims);
+    } else {
+      out->ShareDataWith(*in);
+      out->Resize(out_dims);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SqueezeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    d_x->mutable_data<T>(ctx.GetPlace());
+    bool inplace = ctx.Attr<bool>("inplace");
+
+    auto in_dims = d_x->dims();
+    if (!inplace) {
+      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+      ctx.device_context().Wait();
+      d_x->Resize(in_dims);
+    } else {
+      d_x->ShareDataWith(*d_out);
+      d_x->Resize(in_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle

From 70729ad6416eecb8cb7f4e1d648f83e92bb73bdf Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Fri, 29 Jun 2018 13:13:05 +0000
Subject: [PATCH 02/45] Add Unsqueeze Operator Framework, not finished

---
 paddle/fluid/operators/unsqueeze_op.cc        | 148 ++++++++++++++++++
 paddle/fluid/operators/unsqueeze_op.cu        |  30 ++++
 paddle/fluid/operators/unsqueeze_op.h         |  72 +++++++++
 .../tests/unittests/test_unsqueeze_op.py      |  98 ++++++++++++
 4 files changed, 348 insertions(+)
 create mode 100644 paddle/fluid/operators/unsqueeze_op.cc
 create mode 100644 paddle/fluid/operators/unsqueeze_op.cu
 create mode 100644 paddle/fluid/operators/unsqueeze_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_unsqueeze_op.py

diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
new file mode 100644
index 0000000000..8d2a186685
--- /dev/null
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -0,0 +1,148 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/unsqueeze_op.h"
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using framework::OpKernelType;
+using framework::Tensor;
+
+class UnsqueezeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of UnsqueezeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UnsqueezeOp should not be null.");
+
+    const auto& x_dims = ctx->GetInputDim("X");
+    const auto& axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    // Output rank = input rank + number of inserted axes; it must be <= 9.
+    PADDLE_ENFORCE_LE(x_dims.size() + axes.size(), 9,
+                      "Invalid dimnesions, dynamic dimensions must have "
+                      "between [1, 9] dimensions.");
+    // Every axis must index into the output tensor, i.e. be < output rank.
+    for (int a : axes) {
+      PADDLE_ENFORCE_LT(a, static_cast<int64_t>(x_dims.size() + axes.size()),
+                        "The axis must be less than output tensor's rank.");
+    }
+
+    auto out_dims = GetOutputShape(axes, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+  }
+
+  static framework::DDim GetOutputShape(const std::vector<int> unsqueeze_dims,
+                                        const framework::DDim& in_dims) {
+    int out_dims_size = in_dims.size() + unsqueeze_dims.size();
+    bool should_unsqueeze[9] = {false};
+
+    // Mark which output positions receive an inserted dimension of size 1.
+    for (unsigned int idx = 0; idx < unsqueeze_dims.size(); ++idx) {
+      int current = unsqueeze_dims[idx] < 0
+                        ? unsqueeze_dims[idx] + out_dims_size
+                        : unsqueeze_dims[idx];
+      // A negative axis is offset by the output rank; still < 0 is invalid.
+      PADDLE_ENFORCE_GE(current, 0,
+                        "Invaild axis, negative axis is out of range.");
+      should_unsqueeze[current] = true;
+    }
+
+    // Fill the output: 1 at marked positions, the next input dim elsewhere.
+    std::vector<int64_t> output_shape(out_dims_size, 0);
+    for (int in_idx = 0, out_idx = 0; out_idx < out_dims_size; ++out_idx) {
+      if (!should_unsqueeze[out_idx]) {
+        output_shape[out_idx] = in_dims[in_idx++];
+      } else {
+        output_shape[out_idx] = 1;
+      }
+    }
+
+    return framework::make_ddim(output_shape);
+  }
+};
+
+class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor). The input tensor of unsqueeze operator.");
+    AddOutput("Out", "(Tensor). The output tensor of unsqueeze operator.");
+    AddAttr<std::vector<int>>("axes",
+                              "(std::vector<int>). List of positive integers,"
+                              " indicate the dimensions to be inserted");
+    AddAttr<bool>(
+        "inplace",
+        "(default: false) Unsqueeze the source tensor's shape without "
+        "memory copy. When Attr(inplace) is set true, the output "
+        "tensor shares memory with Input(X), otherwise, a new output "
+        "tensor is created, and its data are copied from Input(x).")
+        .SetDefault(false);
+    AddComment(R"DOC(
+		Unsqueeze Operator.
+		
+		Insert single-dimensional entries to the shape of a tensor. 
+		Takes one required argument axes, a list of dimensions that will be inserted. 
+		Dimension indices in axes are as seen in the output tensor. 
+
+		For example: 
+		  Given a tensor such that tensor with shape [3, 4, 5], 
+		  then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1]
+    )DOC");
+  }
+};
+
+class UnsqueezeGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of UnsqueezeGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Output(Out@GRAD) of UnsqueezeGradOp should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp);
+REGISTER_OP_CPU_KERNEL(
+    unsqueeze, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    unsqueeze_grad,
+    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/unsqueeze_op.cu b/paddle/fluid/operators/unsqueeze_op.cu
new file mode 100644
index 0000000000..891f6cc548
--- /dev/null
+++ b/paddle/fluid/operators/unsqueeze_op.cu
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/fluid/operators/unsqueeze_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    unsqueeze, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    unsqueeze_grad,
+    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h
new file mode 100644
index 0000000000..aa45fb3113
--- /dev/null
+++ b/paddle/fluid/operators/unsqueeze_op.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class UnsqueezeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    auto *in = ctx.Input<framework::LoDTensor>("X");
+
+    framework::DDim out_dims = out->dims();
+
+    bool inplace = ctx.Attr<bool>("inplace");
+    out->Resize(out_dims);
+    if (!inplace) {
+      out->mutable_data<T>(ctx.GetPlace());
+      framework::TensorCopySync(*in, ctx.GetPlace(), out);
+      out->Resize(out_dims);
+    } else {
+      out->ShareDataWith(*in);
+      out->Resize(out_dims);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class UnsqueezeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    d_x->mutable_data<T>(ctx.GetPlace());
+    bool inplace = ctx.Attr<bool>("inplace");
+
+    auto in_dims = d_x->dims();
+    if (!inplace) {
+      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+      ctx.device_context().Wait();
+      d_x->Resize(in_dims);
+    } else {
+      d_x->ShareDataWith(*d_out);
+      d_x->Resize(in_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
new file mode 100644
index 0000000000..273a2c075f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -0,0 +1,98 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+from op_test import OpTest
+
+
+# Valid case: non-negative axes, out-of-place (inplace=False).
+class TestSqueezeOp1(OpTest):
+    def setUp(self):
+        ori_shape = (3, 5)
+        axes = (0, 2)
+        new_shape = (1, 3, 1, 5)
+
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+# Valid case: axes contain a negative index, out-of-place (inplace=False).
+class TestSqueezeOp2(OpTest):
+    def setUp(self):
+        ori_shape = (3, 5)
+        axes = (0, -2)
+        new_shape = (1, 3, 1, 5)
+
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+# Valid case: non-negative axes with the inplace attribute set to True.
+class TestUnsqueezeOpInplace1(OpTest):
+    def setUp(self):
+        ori_shape = (3, 5)
+        axes = (0, 2)
+        new_shape = (1, 3, 1, 5)
+
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inplace": True}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+# Valid case: axes contain a negative index, inplace attribute set to True.
+class TestUnsqueezeOpInplace2(OpTest):
+    def setUp(self):
+        ori_shape = (3, 5)
+        axes = (0, -2)
+        new_shape = (1, 3, 1, 5)
+
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inplace": True}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+if __name__ == "__main__":
+    unittest.main()

From 9ca88fa8a5637e6aa859578ce204cb4d343334c5 Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Fri, 29 Jun 2018 10:27:15 +0000
Subject: [PATCH 03/45] Adjust squeeze op and code the unittest, test passed

---
 paddle/fluid/operators/squeeze_op.cc          |  61 ++++--
 paddle/fluid/operators/squeeze_op.h           |   1 -
 .../fluid/tests/unittests/test_squeeze_op.py  | 174 ++++++++++++++++++
 3 files changed, 218 insertions(+), 18 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_squeeze_op.py

diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 8f453b059f..639480aba4 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -33,11 +33,12 @@ class SqueezeOp : public framework::OperatorWithKernel {
                    "Output(Out) of SqueezeOp should not be null.");
 
     const auto& x_dims = ctx->GetInputDim("X");
-    // TODO(chenweihang): need check input tensor dims (<9).
+    // Check input tensor dims (<9).
+    PADDLE_ENFORCE(x_dims.size() <= 9,
+                   "Invalid dimnesions, dynamic dimensions must have "
+                   "between [1, 9] dimensions.");
 
     const auto& axes = ctx->Attrs().Get<std::vector<int>>("axes");
-    // TODO(chenweihang): need check axes is valid.
-    // PADDLE_ENFORCE();
     for (int a : axes) {
       PADDLE_ENFORCE_LT(a, x_dims.size(),
                         "The axis must be less than input tensor's rank.");
@@ -45,7 +46,12 @@ class SqueezeOp : public framework::OperatorWithKernel {
 
     auto out_dims = GetOutputShape(axes, x_dims);
     ctx->SetOutputDim("Out", out_dims);
-    // TODO(chenweihang): need other check.
+    // TODO(chenweihang): This share option is necessary?
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", "Out");
+    }
   }
 
   static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
@@ -67,12 +73,17 @@ class SqueezeOp : public framework::OperatorWithKernel {
       for (int idx = 0; idx < num_squeeze_dims; ++idx) {
         int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size()
                                             : squeeze_dims[idx];
-        // TODO(chenweihang): shoude use PADALE_ENFORCE ? or if.
-        PADDLE_ENFORCE_GE(current, 0, "Invalid axis is given.");
-        PADDLE_ENFORCE_LT(current, in_dims.size(), "Invalid axis is given.");
-        PADDLE_ENFORCE_EQ(in_dims[current], 1, "Invalid axis is given.");
-
-        if (!(should_squeeze[current])) ++cnt_squeezed_dims;
+        // Check current index.
+        PADDLE_ENFORCE(current >= 0,
+                       "Invalid axis, negative axis is out of range.");
+        // PADDLE_ENFORCE_LT(current, in_dims.size(), "Invalid axis is given.");
+        PADDLE_ENFORCE(
+            in_dims[current] == 1,
+            "Invalid axis index, the axis will be squeezed should be 1.");
+
+        if (!(should_squeeze[current])) {
+          ++cnt_squeezed_dims;
+        }
         should_squeeze[current] = true;
       }
     }
@@ -92,13 +103,14 @@ class SqueezeOp : public framework::OperatorWithKernel {
 class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "(Tensor), Tensors with at least max(dims) dimensions.");
-    AddOutput("Out", "(Tensor), Reshaped tensor with same data as input.");
+    AddInput("X", "(Tensor). The input tensor of squeeze operator.");
+    AddOutput("Out", "(Tensor). The output tensor of squeeze operator.");
     AddAttr<std::vector<int>>("axes",
-                              "List of positive integers,"
-                              " indicate the dimensions to squeeze.");
+                              "(std::vector<int>). List of positive integers,"
+                              " indicate the dimensions to squeeze.")
+        .SetDefault({});
     AddAttr<bool>("inplace",
-                  "(default: false) Change the source tensor's shape without "
+                  "(default: false) Squeeze the source tensor's shape without "
                   "memory copy. When Attr(inplace) is set true, the output "
                   "tensor shares memory with Input(X), otherwise, a new output "
                   "tensor is created, and its data are copied from Input(x).")
@@ -110,6 +122,21 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
 		Takes a parameter axes with a list of axes to squeeze. 
 		If axes is not provided, all the single dimensions will be removed from the shape. 
         If an axis is selected with shape entry not equal to one, an error is raised.
+		
+		Examples:
+		Case 1:
+		  Given 
+			X.shape = (1, 3, 1, 5)
+		  and
+			axes = [0]
+		  we get:
+			Out.shape = (3, 1, 5)
+
+		Case 2:
+		  Given
+			X.shape = (1, 3, 1, 5)
+		  we get:
+			Out.shape = (3, 5)
     )DOC");
   }
 };
@@ -120,9 +147,9 @@ class SqueezeGradOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SqueezeOp should not be null.");
+                   "Input(X) of SqueezeGradOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Output(Out@GRAD/) of SqueezeOp should not be null.");
+                   "Output(Out@GRAD) of SqueezeGradOp should not be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 
diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h
index ce6f40e7a4..44ef324c7d 100644
--- a/paddle/fluid/operators/squeeze_op.h
+++ b/paddle/fluid/operators/squeeze_op.h
@@ -33,7 +33,6 @@ class SqueezeKernel : public framework::OpKernel<T> {
 
     framework::DDim out_dims = out->dims();
 
-    // TODO(chenweihang): Where is this attr be add.
     bool inplace = ctx.Attr<bool>("inplace");
     out->Resize(out_dims);
     if (!inplace) {
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
new file mode 100644
index 0000000000..58c87ea3c1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -0,0 +1,174 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+from op_test import OpTest
+
+
+# Correct: General.
+class TestSqueezeOp1(OpTest):
+    def setUp(self):
+        ori_shape = (1, 3, 1, 5)
+        axes = (0, 2)
+        new_shape = (3, 5)
+
+        self.op_type = "squeeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inpalce": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+# Correct: There is mins axis.
+class TestSqueezeOp2(OpTest):
+    def setUp(self):
+        ori_shape = (1, 3, 1, 5)
+        axes = (0, -2)
+        new_shape = (3, 5)
+
+        self.op_type = "squeeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inpalce": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+# Correct: No axes input.
+class TestSqueezeOp3(OpTest):
+    def setUp(self):
+        ori_shape = (1, 3, 1, 5)
+        axes = ()
+        new_shape = (3, 5)
+
+        self.op_type = "squeeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inpalce": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+# Correct: Just part of axes be squeezed. 
+class TestSqueezeOp4(OpTest):
+    def setUp(self):
+        ori_shape = (1, 3, 1, 5, 1, 4, 1)
+        axes = (2, 6)
+        new_shape = (1, 3, 5, 1, 4)
+
+        self.op_type = "squeeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inpalce": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+# Correct: Inplace.
+class TestSqueezeOpInplace1(OpTest):
+    def setUp(self):
+        ori_shape = (1, 3, 1, 5)
+        axes = (0, 2)
+        new_shape = (3, 5)
+
+        self.op_type = "squeeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inplace": True}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+# Correct: Inplace. There is mins axis.
+class TestSqueezeOpInplace2(OpTest):
+    def setUp(self):
+        ori_shape = (1, 3, 1, 5)
+        axes = (0, -2)
+        new_shape = (3, 5)
+
+        self.op_type = "squeeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inpalce": True}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+# Correct: Inplace. No axes input.
+class TestSqueezeOpInplace3(OpTest):
+    def setUp(self):
+        ori_shape = (1, 3, 1, 5)
+        axes = ()
+        new_shape = (3, 5)
+
+        self.op_type = "squeeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inpalce": True}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+# Correct: Inpalce. Just part of axes be squeezed. 
+class TestSqueezeOpInplace4(OpTest):
+    def setUp(self):
+        ori_shape = (1, 3, 1, 5, 1, 4, 1)
+        axes = (2, 6)
+        new_shape = (1, 3, 5, 1, 4)
+
+        self.op_type = "squeeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inpalce": True}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+if __name__ == "__main__":
+    unittest.main()

From e402496238cd1f2132f7a2e4f354404acdf6dcbb Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Mon, 2 Jul 2018 03:23:27 +0000
Subject: [PATCH 04/45] complete unsqueeze op and related unittest.

---
 paddle/fluid/operators/unsqueeze_op.cc        | 113 ++++++++++++------
 paddle/fluid/operators/unsqueeze_op.cu        |   4 +-
 .../tests/unittests/test_unsqueeze_op.py      |  98 +++++++++++++--
 3 files changed, 167 insertions(+), 48 deletions(-)

diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index 8d2a186685..373dac8bab 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -32,42 +32,85 @@ class UnsqueezeOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of UnsqueezeOp should not be null.");
 
-    const auto& x_dims = ctx->GetInputDim("X");
     const auto& axes = ctx->Attrs().Get<std::vector<int>>("axes");
-    // Check output tensor dims (<9).
-    PADDLE_ENFORCE_LE(x_dims.size() + axes.size(), 9,
-                      "Invalid dimnesions, dynamic dimensions must have "
-                      "between [1, 9] dimensions.");
-    // Check the range of unsqueeze aixs.
-    for (int a : axes) {
-      PADDLE_ENFORCE_LT(a, static_cast<int64_t>(x_dims.size() + axes.size()),
-                        "The axis must be less than output tensor's rank.");
+    PADDLE_ENFORCE(!axes.empty(),
+                   "The unsqueeze axes information must be set by Attr(axes).");
+
+    const auto& x_dims = ctx->GetInputDim("X");
+    // Validity Check: input tensor dims (<6).
+    PADDLE_ENFORCE(x_dims.size() < 6,
+                   "Invalid dimensions, dynamic dimensions should within "
+                   "[0, 5] dimensions (Eigen limit).");
+    // Validity Check: the range of unsqueeze aixs.
+    // TODO(chenweihang): Don't consider negative axis?.
+    for (unsigned int idx = 0; idx < axes.size(); ++idx) {
+      PADDLE_ENFORCE(axes[idx] < 6,
+                     "Invalid dimensions, input axis should within "
+                     "[0, 5] dimensions (Eigen limit).");
     }
 
     auto out_dims = GetOutputShape(axes, x_dims);
     ctx->SetOutputDim("Out", out_dims);
   }
 
-  static framework::DDim GetOutputShape(const std::vector<int> unsqueeze_dims,
+  static framework::DDim GetOutputShape(const std::vector<int> unsqz_dims,
                                         const framework::DDim& in_dims) {
-    int out_dims_size = in_dims.size() + unsqueeze_dims.size();
-    bool should_unsqueeze[9] = {false};
-
-    // Determines the dimensions should be unsqueezed in output tensor after.
-    for (unsigned int idx = 0; idx < unsqueeze_dims.size(); ++idx) {
-      int current = unsqueeze_dims[idx] < 0
-                        ? unsqueeze_dims[idx] + out_dims_size
-                        : unsqueeze_dims[idx];
-      // Check current index.
-      PADDLE_ENFORCE_GE(current, 0,
-                        "Invaild axis, negative axis is out of range.");
-      should_unsqueeze[idx] = true;
+    /*
+     * STL version
+     * Test Error! don't know why?.
+    std::vector<int64_t> output_shape;
+
+    // Contruct base output shape
+    for(int idx = 0; idx < in_dims.size(); ++idx) {
+      output_shape.emplace_back(in_dims[idx]);
+    }
+    // Validity Check: output dimensions limit.
+    PADDLE_ENFORCE(unsqz_dims.size() + output_shape.size() < 6,
+                   "The Attr(axes) size is too large. The output shape should "
+                   "be less than 6 (Eigne limit).");
+    // Insert the unsqueeze axis in turn.
+    auto it = output_shape.begin();
+    for (int axis : unsqz_dims) {
+      int cur = axis < 0 ? (axis + output_shape.size() + 1)
+                         : axis;
+      // Vaildity Check: the axis bound
+      PADDLE_ENFORCE(cur >= 0 && cur <= static_cast<int>(output_shape.size()),
+                     "The unsqueeze dims must be within range of current
+    rank.");
+      output_shape.emplace(it + axis, 1);
+    }
+    */
+
+    unsigned int unsqz_mask = 0;
+    unsigned int front = 0, back = 0;
+    int output_dims_size = in_dims.size();
+
+    // Simulate insert by bit calc.
+    for (int axis : unsqz_dims) {
+      int cur = axis < 0 ? axis + output_dims_size + 1 : axis;
+      // Vaildity Check: the axis bound
+      PADDLE_ENFORCE(
+          cur >= 0 && cur <= output_dims_size,
+          "The unsqueeze dims must be within range of current rank.");
+      // Save the front part.
+      front = unsqz_mask & ((1 << axis) - 1);
+      // Move the back part.
+      back = unsqz_mask & ~((1 << axis) - 1);
+      back <<= 1;
+      // Merge two part.
+      back |= (1 << axis);
+      unsqz_mask = front | back;
+      // Add the output size.
+      output_dims_size++;
+      // Validity Check: rank range.
+      PADDLE_ENFORCE(output_dims_size < 6,
+                     "The output tensor's rank should be less than 6.");
     }
 
-    // Make output dimensions
-    std::vector<int64_t> output_shape(out_dims_size, 0);
-    for (int in_idx = 0, out_idx = 0; out_idx < out_dims_size; ++out_idx) {
-      if (!should_unsqueeze[out_idx]) {
+    // Make output shape
+    std::vector<int64_t> output_shape(output_dims_size, 0);
+    for (int in_idx = 0, out_idx = 0; out_idx < output_dims_size; ++out_idx) {
+      if ((unsqz_mask & (1 << out_idx)) == 0) {
         output_shape[out_idx] = in_dims[in_idx++];
       } else {
         output_shape[out_idx] = 1;
@@ -94,15 +137,15 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
         "tensor is created, and its data are copied from Input(x).")
         .SetDefault(false);
     AddComment(R"DOC(
-		Unsqueeze Operator.
-		
-		Insert single-dimensional entries to the shape of a tensor. 
-		Takes one required argument axes, a list of dimensions that will be inserted. 
-		Dimension indices in axes are as seen in the output tensor. 
-
-		For example: 
-		  Given a tensor such that tensor with shape [3, 4, 5], 
-		  then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1]
+    Unsqueeze Operator.
+    
+    Insert single-dimensional entries to the shape of a tensor. 
+    Takes one required argument axes, a list of dimensions that will be inserted. 
+    Dimension indices in axes are as seen in the output tensor. 
+
+    For example: 
+      Given a tensor such that tensor with shape [3, 4, 5], 
+      then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1]
     )DOC");
   }
 };
diff --git a/paddle/fluid/operators/unsqueeze_op.cu b/paddle/fluid/operators/unsqueeze_op.cu
index 891f6cc548..4d111190cd 100644
--- a/paddle/fluid/operators/unsqueeze_op.cu
+++ b/paddle/fluid/operators/unsqueeze_op.cu
@@ -18,12 +18,12 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    squeeze, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
+    unsqueeze, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
     ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
     ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
     ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
-    squeeze_grad,
+    unsqueeze_grad,
     ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index 273a2c075f..af273ca5a1 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@ from op_test import OpTest
 
 
 # Correct: General.
-class TestSqueezeOp1(OpTest):
+class TestUnsqueezeOp(OpTest):
     def setUp(self):
         ori_shape = (3, 5)
         axes = (0, 2)
@@ -38,7 +38,7 @@ class TestSqueezeOp1(OpTest):
 
 
 # Correct: There is mins axis.
-class TestSqueezeOp2(OpTest):
+class TestUnsqueezeOp2(OpTest):
     def setUp(self):
         ori_shape = (3, 5)
         axes = (0, -2)
@@ -56,6 +56,82 @@ class TestSqueezeOp2(OpTest):
             self.check_grad(["X"], "Out")
 
 
+# Correct: There is duplicated axis.
+class TestUnsqueezeOp3(OpTest):
+    def setUp(self):
+        ori_shape = (3, 2, 5)
+        axes = (0, 3, 3)
+        new_shape = (1, 3, 2, 1, 1, 5)
+
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inpalce": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+        def test_check_output(self):
+            self.check_output()
+
+        def test_check_grad(self):
+            self.check_grad(["X"], "Out")
+
+
+# Error: Output dimension is error.
+class TestUnsqueezeOp4(OpTest):
+    def setUp(self):
+        ori_shape = (3, 2, 5)
+        axes = (0, 3)
+        new_shape = (1, 3, 2, 2, 5)
+
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inpalce": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+        def test_check_output(self):
+            self.check_output()
+
+        def test_check_grad(self):
+            self.check_grad(["X"], "Out")
+
+
+# Error: Input axes is invalid case 1.
+class TestUnsqueezeOp5(OpTest):
+    def setUp(self):
+        ori_shape = (3, 2, 5)
+        axes = (0, 5)
+        new_shape = (1, 3, 1, 5)
+
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inpalce": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+        def test_check_output(self):
+            self.check_output()
+
+        def test_check_grad(self):
+            self.check_grad(["X"], "Out")
+
+
+# Error: Input axes is invalid case 2.
+class TestUnsqueezeOp5(OpTest):
+    def setUp(self):
+        ori_shape = (3, 2, 5)
+        axes = (0, 2, 10)
+        new_shape = (1, 3, 1, 5)
+
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inpalce": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+        def test_check_output(self):
+            self.check_output()
+
+        def test_check_grad(self):
+            self.check_grad(["X"], "Out")
+
+
 # Correct: Inplace.
 class TestUnsqueezeOpInplace1(OpTest):
     def setUp(self):
@@ -75,23 +151,23 @@ class TestUnsqueezeOpInplace1(OpTest):
         self.check_grad(["X"], "Out")
 
 
-# Correct: Inplace. There is mins axis.
+# Correct: Inplace. There is duplicated axis.
 class TestUnsqueezeOpInplace2(OpTest):
     def setUp(self):
-        ori_shape = (3, 5)
-        axes = (0, -2)
-        new_shape = (1, 3, 1, 5)
+        ori_shape = (3, 2, 5)
+        axes = (0, 3, 3)
+        new_shape = (1, 3, 2, 1, 1, 5)
 
         self.op_type = "unsqueeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
         self.attrs = {"axes": axes, "inpalce": True}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
-    def test_check_output(self):
-        self.check_output()
+        def test_check_output(self):
+            self.check_output()
 
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+        def test_check_grad(self):
+            self.check_grad(["X"], "Out")
 
 
 if __name__ == "__main__":

From 0cef33a4683835553e2c212b5d37e345964c5938 Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Tue, 3 Jul 2018 06:08:32 +0000
Subject: [PATCH 05/45] adjust the dims range to [1,6] and fix some problems

---
 paddle/fluid/operators/squeeze_op.cc          |  6 ++---
 paddle/fluid/operators/squeeze_op.cu          |  2 --
 .../fluid/tests/unittests/test_squeeze_op.py  | 26 +++++++++----------
 3 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 639480aba4..29648cdd95 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -33,10 +33,10 @@ class SqueezeOp : public framework::OperatorWithKernel {
                    "Output(Out) of SqueezeOp should not be null.");
 
     const auto& x_dims = ctx->GetInputDim("X");
-    // Check input tensor dims (<9).
-    PADDLE_ENFORCE(x_dims.size() <= 9,
+    // Check input tensor dims (<6) Eigen limit.
+    PADDLE_ENFORCE(x_dims.size() <= 6,
                    "Invalid dimnesions, dynamic dimensions must have "
-                   "between [1, 9] dimensions.");
+                   "between [1, 6] dimensions (Eigen limit).");
 
     const auto& axes = ctx->Attrs().Get<std::vector<int>>("axes");
     for (int a : axes) {
diff --git a/paddle/fluid/operators/squeeze_op.cu b/paddle/fluid/operators/squeeze_op.cu
index 1096907daa..2752574502 100644
--- a/paddle/fluid/operators/squeeze_op.cu
+++ b/paddle/fluid/operators/squeeze_op.cu
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
-
 #include "paddle/fluid/operators/squeeze_op.h"
 
 namespace ops = paddle::operators;
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
index 58c87ea3c1..6ef5204b72 100644
--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -27,7 +27,7 @@ class TestSqueezeOp1(OpTest):
 
         self.op_type = "squeeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": False}
+        self.attrs = {"axes": axes, "inplace": False}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
@@ -46,7 +46,7 @@ class TestSqueezeOp2(OpTest):
 
         self.op_type = "squeeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": False}
+        self.attrs = {"axes": axes, "inplace": False}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
@@ -65,7 +65,7 @@ class TestSqueezeOp3(OpTest):
 
         self.op_type = "squeeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": False}
+        self.attrs = {"axes": axes, "inplace": False}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
@@ -78,13 +78,13 @@ class TestSqueezeOp3(OpTest):
 # Correct: Just part of axes be squeezed. 
 class TestSqueezeOp4(OpTest):
     def setUp(self):
-        ori_shape = (1, 3, 1, 5, 1, 4, 1)
-        axes = (2, 6)
-        new_shape = (1, 3, 5, 1, 4)
+        ori_shape = (3, 1, 5, 1, 4, 1)
+        axes = (1, -1)
+        new_shape = (3, 5, 1, 4)
 
         self.op_type = "squeeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": False}
+        self.attrs = {"axes": axes, "inplace": False}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
@@ -122,7 +122,7 @@ class TestSqueezeOpInplace2(OpTest):
 
         self.op_type = "squeeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": True}
+        self.attrs = {"axes": axes, "inplace": True}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
@@ -141,7 +141,7 @@ class TestSqueezeOpInplace3(OpTest):
 
         self.op_type = "squeeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": True}
+        self.attrs = {"axes": axes, "inplace": True}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
@@ -154,13 +154,13 @@ class TestSqueezeOpInplace3(OpTest):
 # Correct: Inpalce. Just part of axes be squeezed. 
 class TestSqueezeOpInplace4(OpTest):
     def setUp(self):
-        ori_shape = (1, 3, 1, 5, 1, 4, 1)
-        axes = (2, 6)
-        new_shape = (1, 3, 5, 1, 4)
+        ori_shape = (3, 1, 5, 1, 4, 1)
+        axes = (1, -1)
+        new_shape = (3, 5, 1, 4)
 
         self.op_type = "squeeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": True}
+        self.attrs = {"axes": axes, "inplace": True}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):

From 1854814d49602af42a9a630b732dfd1bf5749b1a Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Tue, 3 Jul 2018 14:55:42 +0800
Subject: [PATCH 06/45] Use reshape_op inside squeeze_op

* also convert tabs to spaces
---
 paddle/fluid/operators/CMakeLists.txt |   1 +
 paddle/fluid/operators/squeeze_op.cc  | 142 +++++++++++++++-----------
 paddle/fluid/operators/squeeze_op.cu  |  30 ------
 paddle/fluid/operators/squeeze_op.h   |  72 -------------
 4 files changed, 85 insertions(+), 160 deletions(-)
 delete mode 100644 paddle/fluid/operators/squeeze_op.cu
 delete mode 100644 paddle/fluid/operators/squeeze_op.h

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index ab1d214333..c01d9bc384 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -265,6 +265,7 @@ op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
+op_library(squeeze_op DEPS reshape_op)
 
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 639480aba4..26c3ea3449 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -12,33 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/squeeze_op.h"
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
-using framework::OpKernelType;
-using framework::Tensor;
-
-class SqueezeOp : public framework::OperatorWithKernel {
+class SqueezeOpInferShape : public framework::InferShapeBase {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of SqueezeOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SqueezeOp should not be null.");
 
-    const auto& x_dims = ctx->GetInputDim("X");
+    const auto &x_dims = ctx->GetInputDim("X");
     // Check input tensor dims (<9).
     PADDLE_ENFORCE(x_dims.size() <= 9,
                    "Invalid dimnesions, dynamic dimensions must have "
                    "between [1, 9] dimensions.");
 
-    const auto& axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
     for (int a : axes) {
       PADDLE_ENFORCE_LT(a, x_dims.size(),
                         "The axis must be less than input tensor's rank.");
@@ -55,7 +50,7 @@ class SqueezeOp : public framework::OperatorWithKernel {
   }
 
   static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
-                                        const framework::DDim& in_dims) {
+                                        const framework::DDim &in_dims) {
     int num_squeeze_dims = squeeze_dims.size();
     int cnt_squeezed_dims = 0;
     bool should_squeeze[9] = {false};
@@ -100,6 +95,31 @@ class SqueezeOp : public framework::OperatorWithKernel {
   }
 };
 
+class SqueezeOp : public framework::OperatorBase {
+ public:
+  SqueezeOp(const std::string &type, const framework::VariableNameMap &inputs,
+            const framework::VariableNameMap &outputs,
+            const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axes = Attr<std::vector<int>>("axes");
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims);
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(out_dims);
+    attrs["inplace"] = Attr<bool>("inplace");
+    // Invoke Reshape Op
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
 class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -116,67 +136,73 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
                   "tensor is created, and its data are copied from Input(x).")
         .SetDefault(false);
     AddComment(R"DOC(
-		Squeeze Operator.
-		
-		Remove single-dimensional entries from the shape of a tensor. 
-		Takes a parameter axes with a list of axes to squeeze. 
-		If axes is not provided, all the single dimensions will be removed from the shape. 
+        Squeeze Operator.
+        
+        Remove single-dimensional entries from the shape of a tensor. 
+        Takes a parameter axes with a list of axes to squeeze. 
+        If axes is not provided, all the single dimensions will be removed from the shape. 
         If an axis is selected with shape entry not equal to one, an error is raised.
-		
-		Examples:
-		Case 1:
-		  Given 
-			X.shape = (1, 3, 1, 5)
-		  and
-			axes = [0]
-		  we get:
-			Out.shape = (3, 1, 5)
-
-		Case 2:
-		  Given
-			X.shape = (1, 3, 1, 5)
-		  we get:
-			Out.shape = (3, 5)
+        
+        Examples:
+        Case 1:
+          Given 
+            X.shape = (1, 3, 1, 5)
+          and
+            axes = [0]
+          we get:
+            Out.shape = (3, 1, 5)
+
+        Case 2:
+          Given
+            X.shape = (1, 3, 1, 5)
+          we get:
+            Out.shape = (3, 5)
     )DOC");
   }
 };
 
-class SqueezeGradOp : public framework::OperatorWithKernel {
+class SqueezeGradInferShape : public framework::InferShapeBase {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SqueezeGradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Output(Out@GRAD) of SqueezeGradOp should not be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  void operator()(framework::InferShapeContext *context) const override {
+    context->SetOutputDim(framework::GradVarName("X"),
+                          context->GetInputDim("X"));
+    context->ShareLoD("X", framework::GradVarName("X"));
   }
+};
 
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+class SqueezeGradOp : public framework::OperatorBase {
+ public:
+  SqueezeGradOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(x_dims);
+    attrs["inplace"] = Attr<bool>("inplace");
+
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
+        attrs);
+    reshape_op->Run(scope, place);
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
+// Tell linker to use reshape op
+USE_OP(reshape);
+
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
+                  ops::SqueezeOpInferShape,
                   paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp);
-REGISTER_OP_CPU_KERNEL(
-    squeeze, ops::SqueezeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    squeeze_grad,
-    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
diff --git a/paddle/fluid/operators/squeeze_op.cu b/paddle/fluid/operators/squeeze_op.cu
deleted file mode 100644
index 1096907daa..0000000000
--- a/paddle/fluid/operators/squeeze_op.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#define EIGEN_USE_GPU
-
-#include "paddle/fluid/operators/squeeze_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    squeeze, ops::SqueezeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    squeeze_grad,
-    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h
deleted file mode 100644
index 44ef324c7d..0000000000
--- a/paddle/fluid/operators/squeeze_op.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class SqueezeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
-    auto *in = ctx.Input<framework::LoDTensor>("X");
-
-    framework::DDim out_dims = out->dims();
-
-    bool inplace = ctx.Attr<bool>("inplace");
-    out->Resize(out_dims);
-    if (!inplace) {
-      out->mutable_data<T>(ctx.GetPlace());
-      framework::TensorCopySync(*in, ctx.GetPlace(), out);
-      out->Resize(out_dims);
-    } else {
-      out->ShareDataWith(*in);
-      out->Resize(out_dims);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SqueezeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    d_x->mutable_data<T>(ctx.GetPlace());
-    bool inplace = ctx.Attr<bool>("inplace");
-
-    auto in_dims = d_x->dims();
-    if (!inplace) {
-      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
-      ctx.device_context().Wait();
-      d_x->Resize(in_dims);
-    } else {
-      d_x->ShareDataWith(*d_out);
-      d_x->Resize(in_dims);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle

From ca1577939444cf702d4b131ac0afa8bfbad0211d Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Tue, 3 Jul 2018 03:58:48 +0000
Subject: [PATCH 07/45] rewrite, use reshape op in unsqueeze op, test passed

---
 paddle/fluid/operators/CMakeLists.txt         |   1 +
 paddle/fluid/operators/unsqueeze_op.cc        | 146 +++++++--------
 paddle/fluid/operators/unsqueeze_op.cu        |  30 ----
 paddle/fluid/operators/unsqueeze_op.h         |  72 --------
 .../tests/unittests/test_unsqueeze_op.py      | 167 ++++++++++++------
 5 files changed, 185 insertions(+), 231 deletions(-)
 delete mode 100644 paddle/fluid/operators/unsqueeze_op.cu
 delete mode 100644 paddle/fluid/operators/unsqueeze_op.h

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index ab1d214333..50f5f34021 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -265,6 +265,7 @@ op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
+op_library(unsqueeze_op DEPS reshape_op)
 
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index 373dac8bab..c503988676 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -12,41 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/unsqueeze_op.h"
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
-using framework::OpKernelType;
-using framework::Tensor;
-
-class UnsqueezeOp : public framework::OperatorWithKernel {
+class UnsqueezeOpInferShape : public framework::InferShapeBase {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of UnsqueezeOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of UnsqueezeOp should not be null.");
 
-    const auto& axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
     PADDLE_ENFORCE(!axes.empty(),
                    "The unsqueeze axes information must be set by Attr(axes).");
 
-    const auto& x_dims = ctx->GetInputDim("X");
+    const auto &x_dims = ctx->GetInputDim("X");
     // Validity Check: input tensor dims (<6).
-    PADDLE_ENFORCE(x_dims.size() < 6,
+    PADDLE_ENFORCE(static_cast<int>(x_dims.size()) <= 6,
                    "Invalid dimensions, dynamic dimensions should within "
-                   "[0, 5] dimensions (Eigen limit).");
+                   "[1, 6] dimensions (Eigen limit).");
     // Validity Check: the range of unsqueeze aixs.
-    // TODO(chenweihang): Don't consider negative axis?.
-    for (unsigned int idx = 0; idx < axes.size(); ++idx) {
-      PADDLE_ENFORCE(axes[idx] < 6,
+    for (int axis : axes) {
+      PADDLE_ENFORCE(axis < 6,
                      "Invalid dimensions, input axis should within "
-                     "[0, 5] dimensions (Eigen limit).");
+                     "[1, 6] dimensions (Eigen limit).");
     }
 
     auto out_dims = GetOutputShape(axes, x_dims);
@@ -54,33 +48,7 @@ class UnsqueezeOp : public framework::OperatorWithKernel {
   }
 
   static framework::DDim GetOutputShape(const std::vector<int> unsqz_dims,
-                                        const framework::DDim& in_dims) {
-    /*
-     * STL version
-     * Test Error! don't know why?.
-    std::vector<int64_t> output_shape;
-
-    // Contruct base output shape
-    for(int idx = 0; idx < in_dims.size(); ++idx) {
-      output_shape.emplace_back(in_dims[idx]);
-    }
-    // Validity Check: output dimensions limit.
-    PADDLE_ENFORCE(unsqz_dims.size() + output_shape.size() < 6,
-                   "The Attr(axes) size is too large. The output shape should "
-                   "be less than 6 (Eigne limit).");
-    // Insert the unsqueeze axis in turn.
-    auto it = output_shape.begin();
-    for (int axis : unsqz_dims) {
-      int cur = axis < 0 ? (axis + output_shape.size() + 1)
-                         : axis;
-      // Vaildity Check: the axis bound
-      PADDLE_ENFORCE(cur >= 0 && cur <= static_cast<int>(output_shape.size()),
-                     "The unsqueeze dims must be within range of current
-    rank.");
-      output_shape.emplace(it + axis, 1);
-    }
-    */
-
+                                        const framework::DDim &in_dims) {
     unsigned int unsqz_mask = 0;
     unsigned int front = 0, back = 0;
     int output_dims_size = in_dims.size();
@@ -93,17 +61,17 @@ class UnsqueezeOp : public framework::OperatorWithKernel {
           cur >= 0 && cur <= output_dims_size,
           "The unsqueeze dims must be within range of current rank.");
       // Save the front part.
-      front = unsqz_mask & ((1 << axis) - 1);
+      front = unsqz_mask & ((1 << cur) - 1);
       // Move the back part.
-      back = unsqz_mask & ~((1 << axis) - 1);
+      back = unsqz_mask & ~((1 << cur) - 1);
       back <<= 1;
       // Merge two part.
-      back |= (1 << axis);
+      back |= (1 << cur);
       unsqz_mask = front | back;
       // Add the output size.
       output_dims_size++;
       // Validity Check: rank range.
-      PADDLE_ENFORCE(output_dims_size < 6,
+      PADDLE_ENFORCE(output_dims_size <= 6,
                      "The output tensor's rank should be less than 6.");
     }
 
@@ -121,6 +89,31 @@ class UnsqueezeOp : public framework::OperatorWithKernel {
   }
 };
 
+class UnsqueezeOp : public framework::OperatorBase {
+ public:
+  UnsqueezeOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axes = Attr<std::vector<int>>("axes");
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    auto out_dims = UnsqueezeOpInferShape::GetOutputShape(axes, x_dims);
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(out_dims);
+    attrs["inplace"] = Attr<bool>("inplace");
+    // Invoke Reshape op.
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
 class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -150,42 +143,49 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
-class UnsqueezeGradOp : public framework::OperatorWithKernel {
+class UnsqueezeGradInferShape : public framework::InferShapeBase {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of UnsqueezeGradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Output(Out@GRAD) of UnsqueezeGradOp should not be null.");
+  void operator()(framework::InferShapeContext *ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", framework::GradVarName("X"));
   }
+};
 
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+class UnsqueezeGradOp : public framework::OperatorBase {
+ public:
+  UnsqueezeGradOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(x_dims);
+    attrs["inplace"] = Attr<bool>("inplace");
+
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
+        attrs);
+    reshape_op->Run(scope, place);
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
+// Tell linker to use reshape op.
+USE_OP(reshape);
+
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
+                  ops::UnsqueezeOpInferShape,
                   paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp);
-REGISTER_OP_CPU_KERNEL(
-    unsqueeze, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    unsqueeze_grad,
-    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
+                  ops::UnsqueezeGradInferShape);
diff --git a/paddle/fluid/operators/unsqueeze_op.cu b/paddle/fluid/operators/unsqueeze_op.cu
deleted file mode 100644
index 4d111190cd..0000000000
--- a/paddle/fluid/operators/unsqueeze_op.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#define EIGEN_USE_GPU
-
-#include "paddle/fluid/operators/unsqueeze_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    unsqueeze, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    unsqueeze_grad,
-    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h
deleted file mode 100644
index aa45fb3113..0000000000
--- a/paddle/fluid/operators/unsqueeze_op.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class UnsqueezeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
-    auto *in = ctx.Input<framework::LoDTensor>("X");
-
-    framework::DDim out_dims = out->dims();
-
-    bool inplace = ctx.Attr<bool>("inplace");
-    out->Resize(out_dims);
-    if (!inplace) {
-      out->mutable_data<T>(ctx.GetPlace());
-      framework::TensorCopySync(*in, ctx.GetPlace(), out);
-      out->Resize(out_dims);
-    } else {
-      out->ShareDataWith(*in);
-      out->Resize(out_dims);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class UnsqueezeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    d_x->mutable_data<T>(ctx.GetPlace());
-    bool inplace = ctx.Attr<bool>("inplace");
-
-    auto in_dims = d_x->dims();
-    if (!inplace) {
-      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
-      ctx.device_context().Wait();
-      d_x->Resize(in_dims);
-    } else {
-      d_x->ShareDataWith(*d_out);
-      d_x->Resize(in_dims);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index af273ca5a1..eff90f4618 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -27,7 +27,7 @@ class TestUnsqueezeOp(OpTest):
 
         self.op_type = "unsqueeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": False}
+        self.attrs = {"axes": axes, "inplace": False}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
@@ -37,23 +37,42 @@ class TestUnsqueezeOp(OpTest):
         self.check_grad(["X"], "Out")
 
 
-# Correct: There is mins axis.
+# Correct: Single input index.
+class TestUnsqueezeOp1(OpTest):
+    def setUp(self):
+        ori_shape = (3, 5)
+        axes = (-1, )
+        new_shape = (3, 5, 1)
+
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+# Correct: Mixed input axis.
 class TestUnsqueezeOp2(OpTest):
     def setUp(self):
         ori_shape = (3, 5)
-        axes = (0, -2)
-        new_shape = (1, 3, 1, 5)
+        axes = (0, -1)
+        new_shape = (1, 3, 5, 1)
 
         self.op_type = "unsqueeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": False}
+        self.attrs = {"axes": axes, "inplace": False}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
-        def test_check_output(self):
-            self.check_output()
+    def test_check_output(self):
+        self.check_output()
 
-        def test_check_grad(self):
-            self.check_grad(["X"], "Out")
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
 
 
 # Correct: There is duplicated axis.
@@ -65,83 +84,84 @@ class TestUnsqueezeOp3(OpTest):
 
         self.op_type = "unsqueeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": False}
+        self.attrs = {"axes": axes, "inplace": False}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
-        def test_check_output(self):
-            self.check_output()
+    def test_check_output(self):
+        self.check_output()
 
-        def test_check_grad(self):
-            self.check_grad(["X"], "Out")
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
 
 
-# Error: Output dimension is error.
-class TestUnsqueezeOp4(OpTest):
+# Correct: Inplace.
+class TestUnsqueezeOpInplace1(OpTest):
     def setUp(self):
-        ori_shape = (3, 2, 5)
-        axes = (0, 3)
-        new_shape = (1, 3, 2, 2, 5)
+        ori_shape = (3, 5)
+        axes = (0, 2)
+        new_shape = (1, 3, 1, 5)
 
         self.op_type = "unsqueeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": False}
+        self.attrs = {"axes": axes, "inplace": True}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
-        def test_check_output(self):
-            self.check_output()
+    def test_check_output(self):
+        self.check_output()
 
-        def test_check_grad(self):
-            self.check_grad(["X"], "Out")
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
 
 
-# Error: Input axes is invalid case 1.
-class TestUnsqueezeOp5(OpTest):
+# Correct: Inplace. There is mins index.
+class TestUnsqueezeOpInplace2(OpTest):
     def setUp(self):
-        ori_shape = (3, 2, 5)
-        axes = (0, 5)
+        ori_shape = (3, 5)
+        axes = (0, -2)
         new_shape = (1, 3, 1, 5)
 
         self.op_type = "unsqueeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": False}
+        self.attrs = {"axes": axes, "inplace": True}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
-        def test_check_output(self):
-            self.check_output()
+    def test_check_output(self):
+        self.check_output()
 
-        def test_check_grad(self):
-            self.check_grad(["X"], "Out")
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
 
 
-# Error: Input axes is invalid case 2.
-class TestUnsqueezeOp5(OpTest):
+# Correct: Inplace. There is duplicated axis.
+class TestUnsqueezeOpInplace3(OpTest):
     def setUp(self):
         ori_shape = (3, 2, 5)
-        axes = (0, 2, 10)
-        new_shape = (1, 3, 1, 5)
+        axes = (0, 3, 3)
+        new_shape = (1, 3, 2, 1, 1, 5)
 
         self.op_type = "unsqueeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": False}
+        self.attrs = {"axes": axes, "inplace": True}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
-        def test_check_output(self):
-            self.check_output()
+    def test_check_output(self):
+        self.check_output()
 
-        def test_check_grad(self):
-            self.check_grad(["X"], "Out")
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
 
 
-# Correct: Inplace.
-class TestUnsqueezeOpInplace1(OpTest):
+'''
+# Error: Output dimension is error.
+class TestUnsqueezeOp4(OpTest):
     def setUp(self):
         ori_shape = (3, 5)
-        axes = (0, 2)
-        new_shape = (1, 3, 1, 5)
+        axes = (0, 3)
+        new_shape = (1, 3, 1, 1, 5)
 
         self.op_type = "unsqueeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": True}
+        self.attrs = {"axes": axes, "inplace": False}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
@@ -150,25 +170,60 @@ class TestUnsqueezeOpInplace1(OpTest):
     def test_check_grad(self):
         self.check_grad(["X"], "Out")
 
-
-# Correct: Inplace. There is duplicated axis.
-class TestUnsqueezeOpInplace2(OpTest):
+# Error: Input axis is large than output range.
+class TestUnsqueezeOp5(OpTest):
     def setUp(self):
-        ori_shape = (3, 2, 5)
-        axes = (0, 3, 3)
-        new_shape = (1, 3, 2, 1, 1, 5)
+        ori_shape = (3, 5)
+        axes = (0, 4)
+        new_shape = (1, 3, 5, 1)
 
         self.op_type = "unsqueeze"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inpalce": True}
+        self.attrs = {"axes": axes, "inplace": False}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
-        def test_check_output(self):
-            self.check_output()
+    def test_check_output(self):
+        self.check_output()
 
         def test_check_grad(self):
             self.check_grad(["X"], "Out")
 
+# Error: Input axes is large than Eigen limit.
+class TestUnsqueezeOp6(OpTest):
+    def setUp(self):
+        ori_shape = (3, 5)
+        axes = (0, 2, 10)
+        new_shape = (1, 3, 1, 5, 1)
+
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+# Error: Input axes size is large than Eigen limit.
+class TestUnsqueezeOp7(OpTest):
+    def setUp(self):
+        ori_shape = (3, 5)
+        axes = (0, 2, 2, 2, 2, 2)
+        new_shape = (1, 3, 1, 1, 5, 1)
+
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"axes": axes, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+'''
 
 if __name__ == "__main__":
     unittest.main()

From 49b2cf5feee66010c6598f8d4fc49f1fc1f29048 Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Wed, 4 Jul 2018 09:44:39 +0000
Subject: [PATCH 08/45] adjust some code based on reviewer's advice

---
 paddle/fluid/operators/unsqueeze_op.cc        |  30 ++-
 .../tests/unittests/test_unsqueeze_op.py      | 216 ++++--------------
 2 files changed, 60 insertions(+), 186 deletions(-)

diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index c503988676..62e45468ab 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -36,15 +36,13 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase {
     PADDLE_ENFORCE(static_cast<int>(x_dims.size()) <= 6,
                    "Invalid dimensions, dynamic dimensions should within "
                    "[1, 6] dimensions (Eigen limit).");
-    // Validity Check: the range of unsqueeze aixs.
-    for (int axis : axes) {
-      PADDLE_ENFORCE(axis < 6,
-                     "Invalid dimensions, input axis should within "
-                     "[1, 6] dimensions (Eigen limit).");
-    }
-
     auto out_dims = GetOutputShape(axes, x_dims);
     ctx->SetOutputDim("Out", out_dims);
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", "Out");
+    }
   }
 
   static framework::DDim GetOutputShape(const std::vector<int> unsqz_dims,
@@ -102,6 +100,8 @@ class UnsqueezeOp : public framework::OperatorBase {
     auto &axes = Attr<std::vector<int>>("axes");
     auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
     auto out_dims = UnsqueezeOpInferShape::GetOutputShape(axes, x_dims);
+    // auto out_dims =
+    // scope.FindVar(Output("Out"))->Get<framework::LoDTensor>().dims();
 
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(out_dims);
@@ -121,7 +121,19 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor). The output tensor of unsqueeze operator.");
     AddAttr<std::vector<int>>("axes",
                               "(std::vector<int>). List of positive integers,"
-                              " indicate the dimensions to be inserted");
+                              " indicate the dimensions to be inserted")
+        .AddCustomChecker([](const std::vector<int> &axes) {
+          // Validity Check: axes dims (<6).
+          PADDLE_ENFORCE(static_cast<int>(axes.size()) < 6,
+                         "Invalid dimensions, dynamic dimensions should within "
+                         "[1, 6] dimensions (Eigen limit).");
+          // Validity Check: the range of unsqueeze aixs.
+          for (int axis : axes) {
+            PADDLE_ENFORCE(axis < 6,
+                           "Invalid dimensions, input axis should within "
+                           "[1, 6] dimensions (Eigen limit).");
+          }
+        });
     AddAttr<bool>(
         "inplace",
         "(default: false) Unsqueeze the source tensor's shape without "
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index eff90f4618..62dc6fcb9e 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -21,14 +21,11 @@ from op_test import OpTest
 # Correct: General.
 class TestUnsqueezeOp(OpTest):
     def setUp(self):
-        ori_shape = (3, 5)
-        axes = (0, 2)
-        new_shape = (1, 3, 1, 5)
-
+        self.init_test_case()
         self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
+        self.attrs = {"axes": self.axes, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
 
     def test_check_output(self):
         self.check_output()
@@ -36,194 +33,59 @@ class TestUnsqueezeOp(OpTest):
     def test_check_grad(self):
         self.check_grad(["X"], "Out")
 
+    def init_test_case(self):
+        self.ori_shape = (3, 5)
+        self.axes = (1, 2)
+        self.new_shape = (3, 1, 1, 5)
 
-# Correct: Single input index.
-class TestUnsqueezeOp1(OpTest):
-    def setUp(self):
-        ori_shape = (3, 5)
-        axes = (-1, )
-        new_shape = (3, 5, 1)
-
-        self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
 
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+# Correct: Single input index.
+class TestUnsqueezeOp1(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 5)
+        self.axes = (-1, )
+        self.new_shape = (3, 5, 1)
 
 
 # Correct: Mixed input axis.
-class TestUnsqueezeOp2(OpTest):
-    def setUp(self):
-        ori_shape = (3, 5)
-        axes = (0, -1)
-        new_shape = (1, 3, 5, 1)
-
-        self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+class TestUnsqueezeOp2(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 5)
+        self.axes = (0, -1)
+        self.new_shape = (1, 3, 5, 1)
 
 
 # Correct: There is duplicated axis.
-class TestUnsqueezeOp3(OpTest):
-    def setUp(self):
-        ori_shape = (3, 2, 5)
-        axes = (0, 3, 3)
-        new_shape = (1, 3, 2, 1, 1, 5)
-
-        self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+class TestUnsqueezeOp3(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 2, 5)
+        self.axes = (0, 3, 3)
+        self.new_shape = (1, 3, 2, 1, 1, 5)
 
 
 # Correct: Inplace.
-class TestUnsqueezeOpInplace1(OpTest):
-    def setUp(self):
-        ori_shape = (3, 5)
-        axes = (0, 2)
-        new_shape = (1, 3, 1, 5)
-
-        self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": True}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+class TestUnsqueezeOpInplace1(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 5)
+        self.axes = (0, 2)
+        self.new_shape = (1, 3, 1, 5)
 
 
 # Correct: Inplace. There is mins index.
-class TestUnsqueezeOpInplace2(OpTest):
-    def setUp(self):
-        ori_shape = (3, 5)
-        axes = (0, -2)
-        new_shape = (1, 3, 1, 5)
-
-        self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": True}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+class TestUnsqueezeOpInplace2(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 5)
+        self.axes = (0, -2)
+        self.new_shape = (1, 3, 1, 5)
 
 
 # Correct: Inplace. There is duplicated axis.
-class TestUnsqueezeOpInplace3(OpTest):
-    def setUp(self):
-        ori_shape = (3, 2, 5)
-        axes = (0, 3, 3)
-        new_shape = (1, 3, 2, 1, 1, 5)
-
-        self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": True}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-'''
-# Error: Output dimension is error.
-class TestUnsqueezeOp4(OpTest):
-    def setUp(self):
-        ori_shape = (3, 5)
-        axes = (0, 3)
-        new_shape = (1, 3, 1, 1, 5)
-
-        self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-# Error: Input axis is large than output range.
-class TestUnsqueezeOp5(OpTest):
-    def setUp(self):
-        ori_shape = (3, 5)
-        axes = (0, 4)
-        new_shape = (1, 3, 5, 1)
-
-        self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+class TestUnsqueezeOpInplace3(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 2, 5)
+        self.axes = (0, 3, 3)
+        self.new_shape = (1, 3, 2, 1, 1, 5)
 
-    def test_check_output(self):
-        self.check_output()
-
-        def test_check_grad(self):
-            self.check_grad(["X"], "Out")
-
-# Error: Input axes is large than Eigen limit.
-class TestUnsqueezeOp6(OpTest):
-    def setUp(self):
-        ori_shape = (3, 5)
-        axes = (0, 2, 10)
-        new_shape = (1, 3, 1, 5, 1)
-
-        self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-# Error: Input axes size is large than Eigen limit.
-class TestUnsqueezeOp7(OpTest):
-    def setUp(self):
-        ori_shape = (3, 5)
-        axes = (0, 2, 2, 2, 2, 2)
-        new_shape = (1, 3, 1, 1, 5, 1)
-
-        self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-'''
 
 if __name__ == "__main__":
     unittest.main()

From 927d793746d1dc2e63ebcb7f6ce93845190ec1aa Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Thu, 5 Jul 2018 11:17:37 +0000
Subject: [PATCH 09/45] simplify test case

---
 .../fluid/tests/unittests/test_squeeze_op.py  | 172 ++++++------------
 1 file changed, 56 insertions(+), 116 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
index 6ef5204b72..bca6af2fd5 100644
--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -19,16 +19,13 @@ from op_test import OpTest
 
 
 # Correct: General.
-class TestSqueezeOp1(OpTest):
+class TestSqueezeOp(OpTest):
     def setUp(self):
-        ori_shape = (1, 3, 1, 5)
-        axes = (0, 2)
-        new_shape = (3, 5)
-
         self.op_type = "squeeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+        self.init_test_case()
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
+        self.init_attrs()
+        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
 
     def test_check_output(self):
         self.check_output()
@@ -36,138 +33,81 @@ class TestSqueezeOp1(OpTest):
     def test_check_grad(self):
         self.check_grad(["X"], "Out")
 
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, 2)
+        self.new_shape = (3, 5)
 
-# Correct: There is mins axis.
-class TestSqueezeOp2(OpTest):
-    def setUp(self):
-        ori_shape = (1, 3, 1, 5)
-        axes = (0, -2)
-        new_shape = (3, 5)
-
-        self.op_type = "squeeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": False}
 
-    def test_check_output(self):
-        self.check_output()
 
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+# Correct: There is mins axis.
+class TestSqueezeOp1(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, -2)
+        self.new_shape = (3, 5)
 
 
 # Correct: No axes input.
-class TestSqueezeOp3(OpTest):
-    def setUp(self):
-        ori_shape = (1, 3, 1, 5)
-        axes = ()
-        new_shape = (3, 5)
-
-        self.op_type = "squeeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+class TestSqueezeOp2(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = ()
+        self.new_shape = (3, 5)
 
 
 # Correct: Just part of axes be squeezed. 
-class TestSqueezeOp4(OpTest):
-    def setUp(self):
-        ori_shape = (3, 1, 5, 1, 4, 1)
-        axes = (1, -1)
-        new_shape = (3, 5, 1, 4)
-
-        self.op_type = "squeeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": False}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+class TestSqueezeOp3(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 1, 5, 1, 4, 1)
+        self.axes = (1, -1)
+        self.new_shape = (3, 5, 1, 4)
 
 
 # Correct: Inplace.
-class TestSqueezeOpInplace1(OpTest):
-    def setUp(self):
-        ori_shape = (1, 3, 1, 5)
-        axes = (0, 2)
-        new_shape = (3, 5)
+class TestSqueezeOpInplace1(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, 2)
+        self.new_shape = (3, 5)
 
-        self.op_type = "squeeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": True}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
 
 
 # Correct: Inplace. There is mins axis.
-class TestSqueezeOpInplace2(OpTest):
-    def setUp(self):
-        ori_shape = (1, 3, 1, 5)
-        axes = (0, -2)
-        new_shape = (3, 5)
+class TestSqueezeOpInplace2(TestSqueezeOp):
+    def init_test_case(self):  # overrides the TestSqueezeOp hook (was misspelled "inti_")
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, -2)  # in-place squeeze with a negative axis
+        self.new_shape = (3, 5)
 
-        self.op_type = "squeeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": True}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
 
 
 # Correct: Inplace. No axes input.
-class TestSqueezeOpInplace3(OpTest):
-    def setUp(self):
-        ori_shape = (1, 3, 1, 5)
-        axes = ()
-        new_shape = (3, 5)
-
-        self.op_type = "squeeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": True}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
+class TestSqueezeOpInplace3(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = ()
+        self.new_shape = (3, 5)
 
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
 
 
 # Correct: Inpalce. Just part of axes be squeezed. 
-class TestSqueezeOpInplace4(OpTest):
-    def setUp(self):
-        ori_shape = (3, 1, 5, 1, 4, 1)
-        axes = (1, -1)
-        new_shape = (3, 5, 1, 4)
-
-        self.op_type = "squeeze"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"axes": axes, "inplace": True}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+class TestSqueezeOpInplace4(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 1, 5, 1, 4, 1)
+        self.axes = (1, -1)
+        self.new_shape = (3, 5, 1, 4)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
 
 
 if __name__ == "__main__":

From 80126a7496cc1d0c4568d7b8e5cc92c1f8bf5904 Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Fri, 6 Jul 2018 06:38:24 +0000
Subject: [PATCH 10/45] small fix based on reviewer's advice

---
 paddle/fluid/operators/unsqueeze_op.cc             |  6 +++---
 .../fluid/tests/unittests/test_unsqueeze_op.py     | 14 +++++++++++++-
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index 62e45468ab..d950da6a75 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -28,9 +28,6 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase {
                    "Output(Out) of UnsqueezeOp should not be null.");
 
     const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
-    PADDLE_ENFORCE(!axes.empty(),
-                   "The unsqueeze axes information must be set by Attr(axes).");
-
     const auto &x_dims = ctx->GetInputDim("X");
     // Validity Check: input tensor dims (<6).
     PADDLE_ENFORCE(static_cast<int>(x_dims.size()) <= 6,
@@ -123,6 +120,9 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
                               "(std::vector<int>). List of positive integers,"
                               " indicate the dimensions to be inserted")
         .AddCustomChecker([](const std::vector<int> &axes) {
+          PADDLE_ENFORCE(
+              !axes.empty(),
+              "The unsqueeze axes information must be set by Attr(axes).");
           // Validity Check: axes dims (<6).
           PADDLE_ENFORCE(static_cast<int>(axes.size()) < 6,
                          "Invalid dimensions, dynamic dimensions should within "
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index 62dc6fcb9e..d19d4e525a 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -24,7 +24,7 @@ class TestUnsqueezeOp(OpTest):
         self.init_test_case()
         self.op_type = "unsqueeze"
         self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
-        self.attrs = {"axes": self.axes, "inplace": False}
+        self.init_attrs()
         self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
 
     def test_check_output(self):
@@ -38,6 +38,9 @@ class TestUnsqueezeOp(OpTest):
         self.axes = (1, 2)
         self.new_shape = (3, 1, 1, 5)
 
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": False}
+
 
 # Correct: Single input index.
 class TestUnsqueezeOp1(TestUnsqueezeOp):
@@ -70,6 +73,9 @@ class TestUnsqueezeOpInplace1(TestUnsqueezeOp):
         self.axes = (0, 2)
         self.new_shape = (1, 3, 1, 5)
 
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+
 
 # Correct: Inplace. There is mins index.
 class TestUnsqueezeOpInplace2(TestUnsqueezeOp):
@@ -78,6 +84,9 @@ class TestUnsqueezeOpInplace2(TestUnsqueezeOp):
         self.axes = (0, -2)
         self.new_shape = (1, 3, 1, 5)
 
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+
 
 # Correct: Inplace. There is duplicated axis.
 class TestUnsqueezeOpInplace3(TestUnsqueezeOp):
@@ -86,6 +95,9 @@ class TestUnsqueezeOpInplace3(TestUnsqueezeOp):
         self.axes = (0, 3, 3)
         self.new_shape = (1, 3, 2, 1, 1, 5)
 
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+
 
 if __name__ == "__main__":
     unittest.main()

From 5f89272c89befd113d1fa44e9055f47bcceb455e Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Mon, 9 Jul 2018 06:08:55 +0000
Subject: [PATCH 11/45] change the bit insert to array insert for
 understandability

---
 paddle/fluid/operators/unsqueeze_op.cc        | 57 ++++++++-----------
 .../tests/unittests/test_unsqueeze_op.py      |  8 +++
 2 files changed, 31 insertions(+), 34 deletions(-)

diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index d950da6a75..960bc6f241 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -44,39 +44,37 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase {
 
   static framework::DDim GetOutputShape(const std::vector<int> unsqz_dims,
                                         const framework::DDim &in_dims) {
-    unsigned int unsqz_mask = 0;
-    unsigned int front = 0, back = 0;
-    int output_dims_size = in_dims.size();
+    int output_size = in_dims.size() + unsqz_dims.size();
+    int cur_output_size = in_dims.size();
+    std::vector<int64_t> output_shape(output_size, 0);
+
+    // Validity Check: rank range.
+    PADDLE_ENFORCE(output_size <= 6,
+                   "The output tensor's rank should be less than 6.");
 
-    // Simulate insert by bit calc.
     for (int axis : unsqz_dims) {
-      int cur = axis < 0 ? axis + output_dims_size + 1 : axis;
+      int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
       // Vaildity Check: the axis bound
       PADDLE_ENFORCE(
-          cur >= 0 && cur <= output_dims_size,
+          cur >= 0 && cur <= cur_output_size,
           "The unsqueeze dims must be within range of current rank.");
-      // Save the front part.
-      front = unsqz_mask & ((1 << cur) - 1);
-      // Move the back part.
-      back = unsqz_mask & ~((1 << cur) - 1);
-      back <<= 1;
-      // Merge two part.
-      back |= (1 << cur);
-      unsqz_mask = front | back;
+      // Move old axis, and insert new axis
+      for (int i = cur_output_size; i >= cur; --i) {
+        if (output_shape[i] == 1) {
+          // Move axis
+          output_shape[i + 1] = 1;
+          output_shape[i] = 0;
+        }
+      }
+      output_shape[cur] = 1;
       // Add the output size.
-      output_dims_size++;
-      // Validity Check: rank range.
-      PADDLE_ENFORCE(output_dims_size <= 6,
-                     "The output tensor's rank should be less than 6.");
+      cur_output_size++;
     }
 
     // Make output shape
-    std::vector<int64_t> output_shape(output_dims_size, 0);
-    for (int in_idx = 0, out_idx = 0; out_idx < output_dims_size; ++out_idx) {
-      if ((unsqz_mask & (1 << out_idx)) == 0) {
+    for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
+      if (output_shape[out_idx] == 0) {
         output_shape[out_idx] = in_dims[in_idx++];
-      } else {
-        output_shape[out_idx] = 1;
       }
     }
 
@@ -86,10 +84,7 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase {
 
 class UnsqueezeOp : public framework::OperatorBase {
  public:
-  UnsqueezeOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
+  using OperatorBase::OperatorBase;
 
  private:
   void RunImpl(const framework::Scope &scope,
@@ -97,8 +92,6 @@ class UnsqueezeOp : public framework::OperatorBase {
     auto &axes = Attr<std::vector<int>>("axes");
     auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
     auto out_dims = UnsqueezeOpInferShape::GetOutputShape(axes, x_dims);
-    // auto out_dims =
-    // scope.FindVar(Output("Out"))->Get<framework::LoDTensor>().dims();
 
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(out_dims);
@@ -165,11 +158,7 @@ class UnsqueezeGradInferShape : public framework::InferShapeBase {
 
 class UnsqueezeGradOp : public framework::OperatorBase {
  public:
-  UnsqueezeGradOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
+  using OperatorBase::OperatorBase;
 
  private:
   void RunImpl(const framework::Scope &scope,
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index d19d4e525a..7a4aa0a40b 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -66,6 +66,14 @@ class TestUnsqueezeOp3(TestUnsqueezeOp):
         self.new_shape = (1, 3, 2, 1, 1, 5)
 
 
+# Correct: Reversed axes.
+class TestUnsqueezeOp4(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 2, 5)
+        self.axes = (3, 1, 1)
+        self.new_shape = (3, 1, 1, 2, 5, 1)
+
+
 # Correct: Inplace.
 class TestUnsqueezeOpInplace1(TestUnsqueezeOp):
     def init_test_case(self):

From 1721613f1ec1ba83b8df6cb140bea060bc667642 Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Mon, 9 Jul 2018 07:06:22 +0000
Subject: [PATCH 12/45] simplify constructors

---
 paddle/fluid/operators/squeeze_op.cc | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 55068e84d7..7c07cfdb7e 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -96,10 +96,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
 
 class SqueezeOp : public framework::OperatorBase {
  public:
-  SqueezeOp(const std::string &type, const framework::VariableNameMap &inputs,
-            const framework::VariableNameMap &outputs,
-            const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
+  using OperatorBase::OperatorBase;
 
  private:
   void RunImpl(const framework::Scope &scope,
@@ -171,11 +168,7 @@ class SqueezeGradInferShape : public framework::InferShapeBase {
 
 class SqueezeGradOp : public framework::OperatorBase {
  public:
-  SqueezeGradOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
+  using OperatorBase::OperatorBase;
 
  private:
   void RunImpl(const framework::Scope &scope,

From f5894d22c505ab688c7970fb72681aeb5816fc63 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Mon, 9 Jul 2018 16:11:20 +0800
Subject: [PATCH 13/45] Fix a backward bug

---
 python/paddle/fluid/backward.py | 43 ++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 4faa063031..2357458244 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -136,29 +136,32 @@ def _addup_repetitive_outputs_(op_descs):
                     "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
                     {"use_mkldnn": False}), idx))
                 renamed_vars[var_name] = [var_name]
-        for var_name in op_desc.output_arg_names():
-            if var_name == core.empty_var_name(
-            ) or var_name in op_desc.input_arg_names():
-                # empty variable or inplace op
-                continue
-            if len(renamed_vars[var_name]) == 0:
-                # it's the first time we get the variable
-                renamed_vars[var_name] = [var_name]
-            else:
-                if len(renamed_vars[var_name]) == 1:
+        for param_name in op_desc.output_names():
+            arg_names = op_desc.output(param_name)
+            for arg_idx, var_name in enumerate(arg_names):
+                if var_name == core.empty_var_name(
+                ) or var_name in op_desc.input_arg_names():
+                    # empty variable or inplace op
+                    continue
+                if len(renamed_vars[var_name]) == 0:
+                    # it's the first time we get the variable
+                    renamed_vars[var_name] = [var_name]
+                else:
+                    if len(renamed_vars[var_name]) == 1:
+                        new_name = var_name + "@RENAME@" + \
+                            str(var_rename_count[var_name])
+                        var_rename_count[var_name] += 1
+                        # rename original var_name
+                        renamed_vars[var_name][0] = new_name
+                        _rename_arg_(op_descs, var_name, new_name, 0, idx)
+                        _rename_arg_(pending_sum_ops, var_name, new_name)
+
                     new_name = var_name + "@RENAME@" + \
                         str(var_rename_count[var_name])
                     var_rename_count[var_name] += 1
-                    # rename original var_name
-                    renamed_vars[var_name][0] = new_name
-                    _rename_arg_(op_descs, var_name, new_name, 0, idx)
-                    _rename_arg_(pending_sum_ops, var_name, new_name)
-
-                new_name = var_name + "@RENAME@" + \
-                    str(var_rename_count[var_name])
-                var_rename_count[var_name] += 1
-                op_desc.rename_output(var_name, new_name)
-                renamed_vars[var_name].append(new_name)
+                    arg_names[arg_idx] = new_name
+                    op_desc.set_output(param_name, arg_names)
+                    renamed_vars[var_name].append(new_name)
     for var_name, inputs in renamed_vars.iteritems():
         if len(inputs) > 1:
             pending_sum_ops.append(

From 4922074edced12adefe94d9164f3ca07fc08e26f Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Mon, 9 Jul 2018 17:19:55 +0800
Subject: [PATCH 14/45] inference api symbol hidden

---
 paddle/contrib/inference/CMakeLists.txt           | 2 +-
 paddle/contrib/inference/paddle_inference_api.map | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100644 paddle/contrib/inference/paddle_inference_api.map

diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index c30eff5010..eda47aa4bf 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -51,7 +51,7 @@ cc_library(paddle_inference_api_shared SHARED
     SRCS paddle_inference_api.cc paddle_inference_api_impl.cc)
 set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
 if(NOT APPLE)
-  set(LINK_FLAGS "-fPIC -fvisibility=hidden")
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_api.map")
   set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
 endif()
 
diff --git a/paddle/contrib/inference/paddle_inference_api.map b/paddle/contrib/inference/paddle_inference_api.map
new file mode 100644
index 0000000000..5203784dc1
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.map
@@ -0,0 +1,6 @@
+{
+	global:
+		*paddle*;
+	local:
+		*;
+};

From 3c9b59b8e8d787eb4d2c33e468f48048fdec2959 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Mon, 9 Jul 2018 19:00:53 +0800
Subject: [PATCH 15/45] update

---
 python/paddle/fluid/backward.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 2357458244..71a1653cf4 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -136,7 +136,7 @@ def _addup_repetitive_outputs_(op_descs):
                     "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
                     {"use_mkldnn": False}), idx))
                 renamed_vars[var_name] = [var_name]
-        for param_name in op_desc.output_names():
+        for param_idx, param_name in enumerate(op_desc.output_names()):
             arg_names = op_desc.output(param_name)
             for arg_idx, var_name in enumerate(arg_names):
                 if var_name == core.empty_var_name(
@@ -156,12 +156,26 @@ def _addup_repetitive_outputs_(op_descs):
                         _rename_arg_(op_descs, var_name, new_name, 0, idx)
                         _rename_arg_(pending_sum_ops, var_name, new_name)
 
+                        for p in op_desc.output_names()[:param_idx]:
+                            p_arg_names = op_desc.output(p)
+                            if var_name in p_arg_names:
+                                op_desc.set_output(p, [
+                                    new_name if x == var_name else x
+                                    for x in p_arg_names
+                                ])
+
+                        arg_names = [
+                            new_name if x == var_name else x
+                            for x in arg_names[:arg_idx]
+                        ] + arg_names[arg_idx:]
+
                     new_name = var_name + "@RENAME@" + \
                         str(var_rename_count[var_name])
                     var_rename_count[var_name] += 1
                     arg_names[arg_idx] = new_name
                     op_desc.set_output(param_name, arg_names)
                     renamed_vars[var_name].append(new_name)
+
     for var_name, inputs in renamed_vars.iteritems():
         if len(inputs) > 1:
             pending_sum_ops.append(

From 7f93def03000cb2795ddd7a7d6f8e2ef296030b6 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Mon, 9 Jul 2018 22:37:26 +0800
Subject: [PATCH 16/45] inference api static lib symbol hidden

---
 paddle/contrib/inference/CMakeLists.txt           | 4 ++++
 paddle/contrib/inference/paddle_inference_api.sym | 1 +
 2 files changed, 5 insertions(+)
 create mode 100644 paddle/contrib/inference/paddle_inference_api.sym

diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index c30eff5010..17bf9c983a 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -45,6 +45,10 @@ endfunction(inference_api_test)
 cc_library(paddle_inference_api
     SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+if(NOT APPLE)
+  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_api.sym")
+  set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
 
 # Here the shared library doesn't depend on other fluid libraries, or double free will occur.
 cc_library(paddle_inference_api_shared SHARED
diff --git a/paddle/contrib/inference/paddle_inference_api.sym b/paddle/contrib/inference/paddle_inference_api.sym
new file mode 100644
index 0000000000..ef2a04d788
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.sym
@@ -0,0 +1 @@
+*paddle*

From 2238ea56de2207fb5dfd1db63d6772301cfcc7e3 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Mon, 9 Jul 2018 22:58:11 +0800
Subject: [PATCH 17/45] paddle fluid static lib symbol hidden

---
 paddle/fluid/inference/CMakeLists.txt   | 6 ++++++
 paddle/fluid/inference/paddle_fluid.sym | 1 +
 2 files changed, 7 insertions(+)
 create mode 100644 paddle/fluid/inference/paddle_fluid.sym

diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 1895aea7f9..b1c33c3415 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -13,6 +13,12 @@ endif()
 
 # Create static library
 cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
+if(NOT APPLE)
+  # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
+  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
+  set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
+
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
     SRCS io.cc
diff --git a/paddle/fluid/inference/paddle_fluid.sym b/paddle/fluid/inference/paddle_fluid.sym
new file mode 100644
index 0000000000..ef2a04d788
--- /dev/null
+++ b/paddle/fluid/inference/paddle_fluid.sym
@@ -0,0 +1 @@
+*paddle*

From d552b900f0c25b2a0f0eb34ec9ebf6d49a66342d Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Tue, 10 Jul 2018 02:08:09 +0000
Subject: [PATCH 18/45] change the copyright year from 2016 to 2018

---
 paddle/fluid/operators/squeeze_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 7c07cfdb7e..07555e98c3 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

From c6fe794a9c983f6fa7d9b989ef906143bcff5f40 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Mon, 9 Jul 2018 19:23:10 +0800
Subject: [PATCH 19/45] add independent inference demo on teamcity

---
 paddle/contrib/inference/demo/CMakeLists.txt  |  2 -
 .../contrib/inference/demo_ci/CMakeLists.txt  | 72 +++++++++++++++++++
 paddle/contrib/inference/demo_ci/run.sh       | 31 ++++++++
 .../{demo => demo_ci}/simple_on_word2vec.cc   | 46 +++++++-----
 paddle/scripts/paddle_build.sh                | 15 +++-
 5 files changed, 145 insertions(+), 21 deletions(-)
 create mode 100644 paddle/contrib/inference/demo_ci/CMakeLists.txt
 create mode 100755 paddle/contrib/inference/demo_ci/run.sh
 rename paddle/contrib/inference/{demo => demo_ci}/simple_on_word2vec.cc (80%)

diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt
index ecece6fe34..2d501bf008 100644
--- a/paddle/contrib/inference/demo/CMakeLists.txt
+++ b/paddle/contrib/inference/demo/CMakeLists.txt
@@ -13,8 +13,6 @@
 # limitations under the License.
 #
 
-inference_api_test(simple_on_word2vec ARGS test_word2vec)
-
 option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
 if(NOT WITH_INFERENCE_DEMO)
   return()
diff --git a/paddle/contrib/inference/demo_ci/CMakeLists.txt b/paddle/contrib/inference/demo_ci/CMakeLists.txt
new file mode 100644
index 0000000000..0d175b840d
--- /dev/null
+++ b/paddle/contrib/inference/demo_ci/CMakeLists.txt
@@ -0,0 +1,72 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(cpp_inference_demo CXX C)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+endif()
+if(NOT DEFINED DEMO_NAME)
+  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
+endif()
+
+option(WITH_MKLDNN     "Compile PaddlePaddle with MKLDNN"                                   OFF)
+option(WITH_MKL        "Compile PaddlePaddle with MKL support, default use openblas."       ON)
+option(WITH_GPU        "Compile PaddlePaddle with GPU, default use CPU."                    OFF)
+
+if(WITH_GPU)
+  set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+endif()
+
+include_directories("${PADDLE_LIB}")
+include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
+include_directories("${PADDLE_LIB}/third_party/install/glog/include")
+include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
+include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+
+include_directories("${PADDLE_LIB}/third_party/boost")
+include_directories("${PADDLE_LIB}/third_party/eigen3")
+
+link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
+link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
+link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
+link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+
+add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
+
+if(WITH_MKL)
+  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so 
+               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so)
+  set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
+  if(EXISTS ${MKLDNN_PATH})
+    include_directories("${MKLDNN_PATH}/include")
+    set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+  endif()
+else()
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
+endif()
+
+set(ARCHIVE_START "-Wl,--whole-archive")
+set(ARCHIVE_END "-Wl,--no-whole-archive")
+set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+
+set(DEPS
+    ${ARCHIVE_START}
+    ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a
+    ${ARCHIVE_END}
+    ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.a
+    ${MATH_LIB}
+    ${MKLDNN_LIB}
+    glog gflags protobuf snappystream snappy z
+    ${EXTERNAL_LIB})
+if(WITH_GPU)
+  set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so)
+endif()
+
+target_link_libraries(${DEMO_NAME} ${DEPS})
diff --git a/paddle/contrib/inference/demo_ci/run.sh b/paddle/contrib/inference/demo_ci/run.sh
new file mode 100755
index 0000000000..ad79bce450
--- /dev/null
+++ b/paddle/contrib/inference/demo_ci/run.sh
@@ -0,0 +1,31 @@
+set -x
+PADDLE_ROOT=$1
+WITH_MKL=$2
+WITH_GPU=$3
+
+mkdir -p build
+cd build
+rm -rf *
+
+cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+  -DWITH_MKL=$WITH_MKL \
+  -DDEMO_NAME=simple_on_word2vec \
+  -DWITH_GPU=$WITH_GPU
+make
+if [ $3 == "ON" ]; then
+  use_gpu_list='true false'
+else    
+  use_gpu_list='false'
+fi
+for use_gpu in $use_gpu_list; do
+  ./simple_on_word2vec \
+    --dirname=${PADDLE_ROOT}/build/python/paddle/fluid/tests/book/word2vec.inference.model \
+    --use_gpu=$use_gpu
+done
+if [ $? -eq 0 ]; then
+  exit 0
+else
+  echo "inference demo runs fail."
+  exit 1
+fi
+set +x
diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
similarity index 80%
rename from paddle/contrib/inference/demo/simple_on_word2vec.cc
rename to paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
index c253014642..b3970e389e 100644
--- a/paddle/contrib/inference/demo/simple_on_word2vec.cc
+++ b/paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
@@ -16,21 +16,27 @@ limitations under the License. */
  * This file contains a simple demo for how to take a model for inference.
  */
 
+#include <gflags/gflags.h>
 #include <glog/logging.h>
-#include <gtest/gtest.h>
 #include <memory>
 #include <thread>
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "contrib/inference/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_bool(use_gpu, false, "Whether use gpu.");
 
 namespace paddle {
 namespace demo {
 
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
 void Main(bool use_gpu) {
   //# 1. Create PaddlePredictor with a config.
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  if (FLAGS_dirname.empty()) {
+    LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model";
+    exit(1);
+  }
+  config.model_dir = FLAGS_dirname;
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
@@ -54,7 +60,7 @@ void Main(bool use_gpu) {
     CHECK(predictor->Run(slots, &outputs));
 
     //# 4. Get output.
-    ASSERT_EQ(outputs.size(), 1UL);
+    PADDLE_ENFORCE(outputs.size(), 1UL);
     LOG(INFO) << "output buffer size: " << outputs.front().data.length();
     const size_t num_elements = outputs.front().data.length() / sizeof(float);
     // The outputs' buffers are in CPU memory.
@@ -68,7 +74,7 @@ void MainThreads(int num_threads, bool use_gpu) {
   // Multi-threads only support on CPU
   // 0. Create PaddlePredictor with a config.
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.model_dir = FLAGS_dirname;
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
@@ -94,7 +100,7 @@ void MainThreads(int num_threads, bool use_gpu) {
         CHECK(predictor->Run(inputs, &outputs));
 
         // 4. Get output.
-        ASSERT_EQ(outputs.size(), 1UL);
+        PADDLE_ENFORCE(outputs.size(), 1UL);
         LOG(INFO) << "TID: " << tid << ", "
                   << "output buffer size: " << outputs.front().data.length();
         const size_t num_elements =
@@ -111,15 +117,19 @@ void MainThreads(int num_threads, bool use_gpu) {
   }
 }
 
-TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
-
-#ifdef PADDLE_WITH_CUDA
-TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
-#endif
-
 }  // namespace demo
 }  // namespace paddle
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main(false /* use_gpu*/);
+  paddle::demo::MainThreads(1, false /* use_gpu*/);
+  paddle::demo::MainThreads(4, false /* use_gpu*/);
+  if (FLAGS_use_gpu) {
+    LOG(INFO) << "use_gpu=true";
+    paddle::demo::Main(true /*use_gpu*/);
+    paddle::demo::MainThreads(1, true /*use_gpu*/);
+    paddle::demo::MainThreads(4, true /*use_gpu*/);
+  }
+  return 0;
+}
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index d173b41e86..bf45c11a9d 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -510,11 +510,23 @@ function gen_fluid_inference_lib() {
 EOF
         make -j `nproc` inference_lib_dist
         cd ${PADDLE_ROOT}/build
-        mv fluid_install_dir fluid
+        cp -r fluid_install_dir fluid
         tar -cf fluid.tgz fluid
       fi
 }
 
+function test_fluid_inference_lib() {
+    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
+        cat <<EOF
+    ========================================
+    Testing fluid inference library ...
+    ========================================
+EOF
+        cd ${PADDLE_ROOT}/paddle/contrib/inference/demo_ci
+        sh run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF}
+      fi
+}
+
 function main() {
     set -e
     local CMD=$1
@@ -568,6 +580,7 @@ function main() {
         run_test
         gen_capi_package
         gen_fluid_inference_lib
+        test_fluid_inference_lib
         ;;
       *)
         print_usage

From 07ab9ce4e7622b01a3acd3f71b9e9b4f8b09b7fe Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Tue, 10 Jul 2018 15:25:00 +0800
Subject: [PATCH 20/45] update the comments

---
 python/paddle/fluid/backward.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 71a1653cf4..e7a065599e 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -123,7 +123,8 @@ def _append_grad_suffix_(name):
 def _addup_repetitive_outputs_(op_descs):
     """
     In backward part, an variable may be the output of more than one ops.
-    In this case, the variable should be the accumulation of all the outputs.
+    And one op may yield its multiple outputs to the same variable.
+    In these cases, the variable should be the accumulation of all the outputs.
     `sum_op`s are added to implement the accumulate.
     """
     pending_sum_ops = []

From 0a445da631fc8539b9bb4018c60ee1aa1ddb9bde Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Tue, 10 Jul 2018 15:43:03 +0800
Subject: [PATCH 21/45] Make scope_buffered_ssa_graph_executor Exception safe

---
 .../details/scope_buffered_ssa_graph_executor.cc  | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index eb4e7ec52f..1d80bab90f 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include <stdexcept>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
@@ -53,8 +54,14 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
       }
     }
   }
+  std::vector<framework::LoDTensor> fetch_data;
+  std::exception_ptr eptr;
+  try {
+    fetch_data = underlying_executor_->Run(fetch_tensors);
+  } catch (...) {
+    eptr = std::current_exception();
+  }
 
-  auto fetch_data = underlying_executor_->Run(fetch_tensors);
   drop_scope_counter_ += 1;
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
@@ -69,7 +76,11 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
       scope->DeleteScope(local_scope);
     }
   }
-  return fetch_data;
+  if (eptr) {
+    std::rethrow_exception(eptr);
+  } else {
+    return fetch_data;
+  }
 }
 }  // namespace details
 }  // namespace framework

From 3aaf798182f02312deaf933fe7f049e735738204 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Tue, 10 Jul 2018 15:47:35 +0800
Subject: [PATCH 22/45] Refine size_t and int

---
 .../fluid/operators/reader/create_batch_reader_op.cc   | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc
index 1dbafd23e9..e17c2ffd39 100644
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -23,7 +23,7 @@ class BatchReader : public framework::DecoratedReader {
   BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size,
               bool discard_leftover)
       : DecoratedReader(reader),
-        batch_size_(batch_size),
+        batch_size_(static_cast<size_t>(batch_size)),
         discard_leftover_(discard_leftover) {
     buffer_.reserve(batch_size_);
   }
@@ -31,7 +31,7 @@ class BatchReader : public framework::DecoratedReader {
   void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
 
  private:
-  int batch_size_;
+  size_t batch_size_;
   bool discard_leftover_;
   std::vector<std::vector<framework::LoDTensor>> buffer_;
 };
@@ -78,7 +78,7 @@ class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
 void BatchReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
   buffer_.clear();
   buffer_.reserve(batch_size_);
-  for (int i = 0; i < batch_size_; ++i) {
+  for (size_t i = 0; i < batch_size_; ++i) {
     buffer_.push_back(std::vector<framework::LoDTensor>());
     reader_->ReadNext(&buffer_.back());
     if (buffer_.back().empty()) {
@@ -95,9 +95,9 @@ void BatchReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
     // if buffer_ is empty, the 'out' will return as an empty vector.
     return;
   }
-  int out_num = buffer_[0].size();
+  size_t out_num = buffer_[0].size();
   out->reserve(out_num);
-  for (int j = 0; j < out_num; ++j) {
+  for (size_t j = 0; j < out_num; ++j) {
     // Merge shape and check date type
     std::type_index batch_type = buffer_[0][j].type();
     framework::DDim batch_shape = buffer_[0][j].dims();

From c822d0309bc410e382bc869e46027111d5d6c6f2 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Tue, 10 Jul 2018 16:02:24 +0800
Subject: [PATCH 23/45] Refine code

---
 .../framework/details/threaded_ssa_graph_executor.cc  | 11 ++++++++---
 .../framework/details/threaded_ssa_graph_executor.h   |  3 +++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 99b10254a7..8a8c3a5938 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -78,6 +78,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     set.clear();
   };
 
+  // Clean run context
+  run_op_futures_.clear();
+  exception_.reset();
+
   // Step 3. Execution
   while (!pending_vars.empty()) {
     // 1. Run All Ready ops
@@ -98,14 +102,15 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     if (timeout) {
       std::lock_guard<std::mutex> l(exception_mu_);
       if (exception_) {
+        for (auto &run_op_future : run_op_futures_) {
+          run_op_future.wait();
+        }
         std::exception *exp = exception_.get();
         if (dynamic_cast<platform::EOFException *>(exp)) {
           auto e = *static_cast<platform::EOFException *>(exp);
-          exception_.reset();
           throw e;
         } else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
           auto e = *static_cast<platform::EnforceNotMet *>(exp);
-          exception_.reset();
           throw e;
         } else {
           LOG(FATAL) << "Unknown exception.";
@@ -222,7 +227,7 @@ void ThreadedSSAGraphExecutor::RunOp(
     }
   };
   if (pool_) {
-    pool_->enqueue(op_run);
+    run_op_futures_.emplace_back(pool_->enqueue(op_run));
   } else {
     op_run();
   }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index c69e0487e2..09973b7a72 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <deque>
+#include <list>
 #include <string>
 #include <unordered_set>
 #include <utility>
@@ -77,6 +78,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
 
  private:
   ExecutionStrategy strategy_;
+  // use std::list because clear(), push_back, and for_each are O(1)
+  std::list<std::future<void>> run_op_futures_;
 };
 
 }  // namespace details

From 6c83dcd622000756d470202ff957907861920f26 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Tue, 10 Jul 2018 16:37:40 +0800
Subject: [PATCH 24/45] Hide get_places. Mark it as deprecated

---
 python/paddle/fluid/annotations.py            | 38 +++++++++++++++++++
 python/paddle/fluid/layers/device.py          |  4 +-
 .../tests/book/notest_understand_sentiment.py |  4 +-
 .../fluid/tests/book/test_recognize_digits.py | 18 +++++----
 .../paddle/fluid/tests/book/test_word2vec.py  |  3 +-
 .../test_memopt_fit_a_line.py                 |  9 +++--
 .../tests/unittests/test_get_places_op.py     |  3 +-
 .../fluid/tests/unittests/test_layers.py      |  3 +-
 .../fluid/tests/unittests/test_parallel_op.py |  3 +-
 9 files changed, 66 insertions(+), 19 deletions(-)
 create mode 100644 python/paddle/fluid/annotations.py

diff --git a/python/paddle/fluid/annotations.py b/python/paddle/fluid/annotations.py
new file mode 100644
index 0000000000..bb8756a466
--- /dev/null
+++ b/python/paddle/fluid/annotations.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import sys
+
+__all__ = ['deprecated']
+
+
+def deprecated(since, instead, extra_message=""):
+    def decorator(func):
+        err_msg = "API {0} is deprecated since {1}. Please use {2} instead.".format(
+            func.__name__, since, instead)
+        if len(extra_message) != 0:
+            err_msg += "\n"
+            err_msg += extra_message
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            print >> sys.stderr, err_msg
+            return func(*args, **kwargs)
+
+        wrapper.__doc__ += "\n    "
+        wrapper.__doc__ += err_msg
+        return wrapper
+
+    return decorator
diff --git a/python/paddle/fluid/layers/device.py b/python/paddle/fluid/layers/device.py
index e0c1aab230..384d302a70 100644
--- a/python/paddle/fluid/layers/device.py
+++ b/python/paddle/fluid/layers/device.py
@@ -18,10 +18,12 @@ All util layers.
 from layer_function_generator import autodoc
 from ..framework import unique_name
 from ..layer_helper import LayerHelper
+from ..annotations import deprecated
 
-__all__ = ['get_places']
+__all__ = []
 
 
+@deprecated(since='0.15.0', instead="ParallelExecutor")
 @autodoc()
 def get_places(device_count=None, device_type=None):
     helper = LayerHelper('get_places', **locals())
diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
index 1df7b99aad..95002aa7f9 100644
--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
-
+from paddle.fluid.layers.device import get_places
 import unittest
 import paddle.fluid as fluid
 import paddle
@@ -144,7 +144,7 @@ def train(word_dict,
         cost, acc_out, prediction = net_method(
             data, label, input_dim=dict_dim, class_dim=class_dim)
     else:
-        places = fluid.layers.get_places()
+        places = get_places()
         pd = fluid.layers.ParallelDo(places)
         with pd.do():
             cost, acc, _ = net_method(
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index 5f5c8544bb..49f549fa18 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -12,15 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
-import argparse
-import paddle.fluid as fluid
-import paddle
-import sys
-import numpy
-import unittest
+
 import math
-import sys
 import os
+import sys
+import unittest
+
+import numpy
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 
 BATCH_SIZE = 64
 
@@ -76,7 +78,7 @@ def train(nn_type,
         net_conf = conv_net
 
     if parallel:
-        places = fluid.layers.get_places()
+        places = get_places()
         pd = fluid.layers.ParallelDo(places)
         with pd.do():
             img_ = pd.read_input(img)
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index 49bd72c7a5..80e0692bc6 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -14,6 +14,7 @@
 
 import paddle
 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import unittest
 import os
 import numpy as np
@@ -80,7 +81,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
         avg_cost, predict_word = __network__(
             [first_word, second_word, third_word, forth_word, next_word])
     else:
-        places = fluid.layers.get_places()
+        places = get_places()
         pd = fluid.layers.ParallelDo(places)
         with pd.do():
             avg_cost, predict_word = __network__(
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index be347cd531..bec9f8594f 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
-import paddle
-import paddle.fluid as fluid
 import math
 import sys
 
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
+
 # need to fix random seed and training data to compare the loss
 # value accurately calculated by the default and the memory optimization
 # version.
@@ -34,7 +35,7 @@ if fluid.core.is_compiled_with_cuda():
     use_nccl = False
     place = fluid.CUDAPlace(0)
 
-places = fluid.layers.get_places(device_count=0, device_type=device_type)
+places = get_places(device_count=0, device_type=device_type)
 pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
 with pd.do():
     x_ = pd.read_input(x)
diff --git a/python/paddle/fluid/tests/unittests/test_get_places_op.py b/python/paddle/fluid/tests/unittests/test_get_places_op.py
index 6dab1e22f0..964423e2d2 100644
--- a/python/paddle/fluid/tests/unittests/test_get_places_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_places_op.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import decorators
 import unittest
 
@@ -20,7 +21,7 @@ import unittest
 class TestGetPlaces(unittest.TestCase):
     @decorators.prog_scope()
     def test_get_places(self):
-        places = fluid.layers.get_places()
+        places = get_places()
         cpu = fluid.CPUPlace()
         exe = fluid.Executor(cpu)
         exe.run(fluid.default_main_program())
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 842d34c07e..f99ddc9bc4 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest
 
 import paddle.fluid.layers as layers
+from paddle.fluid.layers.device import get_places
 import paddle.fluid.nets as nets
 from paddle.fluid.framework import Program, program_guard, default_main_program
 from paddle.fluid.param_attr import ParamAttr
@@ -238,7 +239,7 @@ class TestBook(unittest.TestCase):
     def test_get_places(self):
         program = Program()
         with program_guard(program):
-            x = layers.get_places(device_count=4)
+            x = get_places(device_count=4)
             self.assertIsNotNone(x)
         print(str(program))
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py
index 9ba5f988f3..9ec05e0297 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
@@ -15,6 +15,7 @@
 import unittest
 
 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import paddle.fluid.profiler as profiler
 import numpy
 
@@ -115,7 +116,7 @@ class BaseParallelForTest(unittest.TestCase):
             if use_parallel:
                 thread_num = fluid.core.get_cuda_device_count(
                 ) if use_gpu else 8
-                places = fluid.layers.get_places(thread_num)
+                places = get_places(thread_num)
                 pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
                 data = next(generator)
 

From 05eafcca7373d690916244db0890370d64d7b535 Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Tue, 10 Jul 2018 08:41:36 +0000
Subject: [PATCH 25/45] refine some messages and adjust data type

---
 paddle/fluid/operators/squeeze_op.cc | 29 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 1b656ea138..805f198bf3 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -30,13 +30,14 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
     const auto &x_dims = ctx->GetInputDim("X");
     // Check input tensor dims (<6) Eigen limit.
     PADDLE_ENFORCE(x_dims.size() <= 6,
-                   "Invalid dimnesions, dynamic dimensions must have "
-                   "between [1, 6] dimensions (Eigen limit).");
+                   "Invalid dimnesions, the rank of Input(X) "
+                   "should be in the range of [1, 6] (Eigen limit).");
 
     const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
     for (int a : axes) {
       PADDLE_ENFORCE_LT(a, x_dims.size(),
-                        "The axis must be less than input tensor's rank.");
+                        "The squeeze axis should be less than input "
+                        "tensor's rank.");
     }
 
     auto out_dims = GetOutputShape(axes, x_dims);
@@ -50,30 +51,29 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
 
   static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
                                         const framework::DDim &in_dims) {
-    int num_squeeze_dims = static_cast<int>(squeeze_dims.size());
+    size_t num_squeeze_dims = squeeze_dims.size();
     int cnt_squeezed_dims = 0;
     bool should_squeeze[9] = {false};
 
     // Determines number of dimensions of output tensor after squeeze.
     // Mark and count the dimensions need to be squeezed
     if (num_squeeze_dims == 0) {
-      for (int idx = 0; idx < static_cast<int>(in_dims.size()); ++idx) {
+      for (int idx = 0; idx < in_dims.size(); ++idx) {
         if (in_dims[idx] == 1) {
           should_squeeze[idx] = true;
           ++cnt_squeezed_dims;
         }
       }
     } else {
-      for (int idx = 0; idx < num_squeeze_dims; ++idx) {
+      for (size_t idx = 0; idx < num_squeeze_dims; ++idx) {
         int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size()
                                             : squeeze_dims[idx];
-        // Check current index.
+        // Check current index, the upper limit has beed checked in line 36.
         PADDLE_ENFORCE(current >= 0,
-                       "Invalid axis, negative axis is out of range.");
-        // PADDLE_ENFORCE_LT(current, in_dims.size(), "Invalid axis is given.");
-        PADDLE_ENFORCE(
-            in_dims[current] == 1,
-            "Invalid axis index, the axis will be squeezed should be 1.");
+                       "Invalid axis, the negative axis is out of range.");
+        PADDLE_ENFORCE(in_dims[current] == 1,
+                       "Invalid axis index, the axis that will be squeezed "
+                       "should equal 1.");
 
         if (!(should_squeeze[current])) {
           ++cnt_squeezed_dims;
@@ -84,8 +84,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
 
     // Make output dimensions
     std::vector<int64_t> output_shape(in_dims.size() - cnt_squeezed_dims, 0);
-    for (int in_idx = 0, out_idx = 0; in_idx < static_cast<int>(in_dims.size());
-         ++in_idx) {
+    for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) {
       if (!should_squeeze[in_idx]) {
         output_shape[out_idx++] = in_dims[in_idx];
       }
@@ -123,7 +122,7 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "(Tensor). The input tensor of squeeze operator.");
     AddOutput("Out", "(Tensor). The output tensor of squeeze operator.");
     AddAttr<std::vector<int>>("axes",
-                              "(std::vector<int>). List of positive integers,"
+                              "(std::vector<int>). List of integers,"
                               " indicate the dimensions to squeeze.")
         .SetDefault({});
     AddAttr<bool>("inplace",

From cef8dbc1f7867d013046227f8283ee249bda8a0f Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Tue, 10 Jul 2018 09:09:55 +0000
Subject: [PATCH 26/45] refine some messages and adjust data type

---
 paddle/fluid/operators/unsqueeze_op.cc | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index 5e089d77f4..da542aa852 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -30,9 +30,9 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase {
     const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
     const auto &x_dims = ctx->GetInputDim("X");
     // Validity Check: input tensor dims (<6).
-    PADDLE_ENFORCE(static_cast<int>(x_dims.size()) <= 6,
-                   "Invalid dimensions, dynamic dimensions should within "
-                   "[1, 6] dimensions (Eigen limit).");
+    PADDLE_ENFORCE(x_dims.size() <= 6,
+                   "Invalid dimensions, the rank of Input(X) "
+                   "should be in the range of [1, 6] (Eigen limit)");
     auto out_dims = GetOutputShape(axes, x_dims);
     ctx->SetOutputDim("Out", out_dims);
     if (x_dims[0] == out_dims[0]) {
@@ -44,8 +44,8 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase {
 
   static framework::DDim GetOutputShape(const std::vector<int> unsqz_dims,
                                         const framework::DDim &in_dims) {
-    int output_size = static_cast<int>(in_dims.size() + unsqz_dims.size());
-    int cur_output_size = static_cast<int>(in_dims.size());
+    int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
+    int cur_output_size = in_dims.size();
     std::vector<int64_t> output_shape(output_size, 0);
 
     // Validity Check: rank range.
@@ -110,12 +110,11 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "(Tensor). The input tensor of unsqueeze operator.");
     AddOutput("Out", "(Tensor). The output tensor of unsqueeze operator.");
     AddAttr<std::vector<int>>("axes",
-                              "(std::vector<int>). List of positive integers,"
+                              "(std::vector<int>). List of integers,"
                               " indicate the dimensions to be inserted")
         .AddCustomChecker([](const std::vector<int> &axes) {
-          PADDLE_ENFORCE(
-              !axes.empty(),
-              "The unsqueeze axes information must be set by Attr(axes).");
+          PADDLE_ENFORCE(!axes.empty(),
+                         "Invalid axes, The unsqueeze axes is empty.");
           // Validity Check: axes dims (<6).
           PADDLE_ENFORCE(static_cast<int>(axes.size()) < 6,
                          "Invalid dimensions, dynamic dimensions should within "

From f8a74ccc7b9005d9a73e806d91241137563301de Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Tue, 10 Jul 2018 17:48:01 +0800
Subject: [PATCH 27/45] add shared library test

---
 .../contrib/inference/demo_ci/CMakeLists.txt  | 28 ++++++++++-------
 paddle/contrib/inference/demo_ci/run.sh       | 31 ++++++++++---------
 .../inference/demo_ci/simple_on_word2vec.cc   | 18 +++++++----
 3 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/paddle/contrib/inference/demo_ci/CMakeLists.txt b/paddle/contrib/inference/demo_ci/CMakeLists.txt
index 0d175b840d..09aace2d8a 100644
--- a/paddle/contrib/inference/demo_ci/CMakeLists.txt
+++ b/paddle/contrib/inference/demo_ci/CMakeLists.txt
@@ -11,9 +11,9 @@ if(NOT DEFINED DEMO_NAME)
   message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
 endif()
 
-option(WITH_MKLDNN     "Compile PaddlePaddle with MKLDNN"                                   OFF)
-option(WITH_MKL        "Compile PaddlePaddle with MKL support, default use openblas."       ON)
-option(WITH_GPU        "Compile PaddlePaddle with GPU, default use CPU."                    OFF)
+option(WITH_MKL        "Compile demo with MKL/OpenBlas support, default use MKL."       ON)
+option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."                    OFF)
+option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   ON)
 
 if(WITH_GPU)
   set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
@@ -52,17 +52,21 @@ else()
   set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
 endif()
 
-set(ARCHIVE_START "-Wl,--whole-archive")
-set(ARCHIVE_END "-Wl,--no-whole-archive")
+if(WITH_STATIC_LIB)
+  set(DEPS
+      "-Wl,--whole-archive"
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a
+      "-Wl,--no-whole-archive"
+      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.a)
+else()
+  set(DEPS
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so
+      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.so)
+endif()
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 
-set(DEPS
-    ${ARCHIVE_START}
-    ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a
-    ${ARCHIVE_END}
-    ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.a
-    ${MATH_LIB}
-    ${MKLDNN_LIB}
+set(DEPS ${DEPS}
+    ${MATH_LIB} ${MKLDNN_LIB}
     glog gflags protobuf snappystream snappy z
     ${EXTERNAL_LIB})
 if(WITH_GPU)
diff --git a/paddle/contrib/inference/demo_ci/run.sh b/paddle/contrib/inference/demo_ci/run.sh
index ad79bce450..e33e939463 100755
--- a/paddle/contrib/inference/demo_ci/run.sh
+++ b/paddle/contrib/inference/demo_ci/run.sh
@@ -2,25 +2,28 @@ set -x
 PADDLE_ROOT=$1
 WITH_MKL=$2
 WITH_GPU=$3
-
-mkdir -p build
-cd build
-rm -rf *
-
-cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
-  -DWITH_MKL=$WITH_MKL \
-  -DDEMO_NAME=simple_on_word2vec \
-  -DWITH_GPU=$WITH_GPU
-make
 if [ $3 == "ON" ]; then
   use_gpu_list='true false'
 else    
   use_gpu_list='false'
 fi
-for use_gpu in $use_gpu_list; do
-  ./simple_on_word2vec \
-    --dirname=${PADDLE_ROOT}/build/python/paddle/fluid/tests/book/word2vec.inference.model \
-    --use_gpu=$use_gpu
+
+mkdir -p build
+cd build
+
+for WITH_STATIC_LIB in true false; do
+  rm -rf *
+  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+    -DWITH_MKL=$WITH_MKL \
+    -DDEMO_NAME=simple_on_word2vec \
+    -DWITH_GPU=$WITH_GPU \
+    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+  make
+  for use_gpu in $use_gpu_list; do
+    ./simple_on_word2vec \
+      --dirname=${PADDLE_ROOT}/build/python/paddle/fluid/tests/book/word2vec.inference.model \
+      --use_gpu=$use_gpu
+  done
 done
 if [ $? -eq 0 ]; then
   exit 0
diff --git a/paddle/contrib/inference/demo_ci/simple_on_word2vec.cc b/paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
index b3970e389e..9713837f86 100644
--- a/paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
+++ b/paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
@@ -61,11 +61,15 @@ void Main(bool use_gpu) {
 
     //# 4. Get output.
     PADDLE_ENFORCE(outputs.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
+    // Check the output buffer size and result of each tid.
+    PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+    float result[5] = {
+        0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706};
     const size_t num_elements = outputs.front().data.length() / sizeof(float);
     // The outputs' buffers are in CPU memory.
     for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+      PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
+                     result[i]);
     }
   }
 }
@@ -101,13 +105,16 @@ void MainThreads(int num_threads, bool use_gpu) {
 
         // 4. Get output.
         PADDLE_ENFORCE(outputs.size(), 1UL);
-        LOG(INFO) << "TID: " << tid << ", "
-                  << "output buffer size: " << outputs.front().data.length();
+        // Check the output buffer size and result of each tid.
+        PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+        float result[5] = {
+            0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706};
         const size_t num_elements =
             outputs.front().data.length() / sizeof(float);
         // The outputs' buffers are in CPU memory.
         for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-          LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+          PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
+                         result[i]);
         }
       }
     });
@@ -126,7 +133,6 @@ int main(int argc, char** argv) {
   paddle::demo::MainThreads(1, false /* use_gpu*/);
   paddle::demo::MainThreads(4, false /* use_gpu*/);
   if (FLAGS_use_gpu) {
-    LOG(INFO) << "use_gpu=true";
     paddle::demo::Main(true /*use_gpu*/);
     paddle::demo::MainThreads(1, true /*use_gpu*/);
     paddle::demo::MainThreads(4, true /*use_gpu*/);

From b065522d6ce415bd4aeed5944387ad3615deb681 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 10 Jul 2018 18:15:20 +0800
Subject: [PATCH 28/45] add some symbol check

---
 paddle/contrib/inference/CMakeLists.txt  | 11 +++++++++++
 paddle/contrib/inference/check_symbol.sh | 12 ++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100755 paddle/contrib/inference/check_symbol.sh

diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index eda47aa4bf..bb09649354 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -53,6 +53,17 @@ set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_
 if(NOT APPLE)
   set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_api.map")
   set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
+    "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
+    " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n"
+    "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
+    "  message(FATAL_ERROR \"Check symbol failed.\")\n"
+    "endif()\n")
+  add_custom_target(check_symbol ALL)
+  add_custom_command(
+      TARGET check_symbol
+      COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
+      DEPENDS paddle_inference_api_shared)
 endif()
 
 cc_test(test_paddle_inference_api
diff --git a/paddle/contrib/inference/check_symbol.sh b/paddle/contrib/inference/check_symbol.sh
new file mode 100755
index 0000000000..6547ca1413
--- /dev/null
+++ b/paddle/contrib/inference/check_symbol.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+lib=$1
+if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
+
+num_paddle_syms=$(nm -D --defined-only ${lib} | grep paddle | wc -l)
+num_google_syms=$(nm -D --defined-only ${lib} | grep google | wc -l)
+
+if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
+if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
+
+exit 0

From 57b30c2be3b401c95e080d7bb98892a5af599fc6 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Tue, 10 Jul 2018 21:43:51 +0800
Subject: [PATCH 29/45] fix compiler error when using shared library, disable
 testing static library

---
 paddle/contrib/inference/demo_ci/CMakeLists.txt | 5 +++--
 paddle/contrib/inference/demo_ci/run.sh         | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddle/contrib/inference/demo_ci/CMakeLists.txt b/paddle/contrib/inference/demo_ci/CMakeLists.txt
index 09aace2d8a..789bff7f23 100644
--- a/paddle/contrib/inference/demo_ci/CMakeLists.txt
+++ b/paddle/contrib/inference/demo_ci/CMakeLists.txt
@@ -59,9 +59,10 @@ if(WITH_STATIC_LIB)
       "-Wl,--no-whole-archive"
       ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.a)
 else()
+  # Note: libpaddle_inference_api.so must put before libpaddle_fluid.so
   set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so
-      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.so)
+      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.so
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
 endif()
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 
diff --git a/paddle/contrib/inference/demo_ci/run.sh b/paddle/contrib/inference/demo_ci/run.sh
index e33e939463..e3a7269af7 100755
--- a/paddle/contrib/inference/demo_ci/run.sh
+++ b/paddle/contrib/inference/demo_ci/run.sh
@@ -11,7 +11,7 @@ fi
 mkdir -p build
 cd build
 
-for WITH_STATIC_LIB in true false; do
+for WITH_STATIC_LIB in false; do
   rm -rf *
   cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
     -DWITH_MKL=$WITH_MKL \

From eeaf7673b8cb7f1354a0a280c19050a1607eab4c Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 10 Jul 2018 22:28:06 +0800
Subject: [PATCH 30/45] fix depends

---
 paddle/contrib/inference/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index 75dbfb6255..8562bea0ed 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -61,11 +61,11 @@ if(NOT APPLE)
     "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
     "  message(FATAL_ERROR \"Check symbol failed.\")\n"
     "endif()\n")
-  add_custom_target(check_symbol ALL)
   add_custom_command(
-      TARGET check_symbol
-      COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
+      OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.check_symbol
+      COMMAND ${CMAKE_COMMAND} -P ${cmake_current_binary_dir}/check_symbol.cmake
       DEPENDS paddle_inference_api_shared)
+  add_custom_target(check_symbol ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/.check_symbol)
 endif()
 
 cc_test(test_paddle_inference_api

From e4e0ffdbf2c218c952045ba36502bd5adc350dca Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 11 Jul 2018 10:42:05 +0800
Subject: [PATCH 31/45] fix check_symbol cmake error

---
 paddle/contrib/inference/CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index 8562bea0ed..bfd38eb22f 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -62,10 +62,10 @@ if(NOT APPLE)
     "  message(FATAL_ERROR \"Check symbol failed.\")\n"
     "endif()\n")
   add_custom_command(
-      OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.check_symbol
-      COMMAND ${CMAKE_COMMAND} -P ${cmake_current_binary_dir}/check_symbol.cmake
-      DEPENDS paddle_inference_api_shared)
-  add_custom_target(check_symbol ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/.check_symbol)
+    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
+    COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
+    DEPENDS paddle_inference_api_shared)
+  add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
 endif()
 
 cc_test(test_paddle_inference_api

From 5e725dc52b37669c53966a4ea3436fecf3f0be91 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Wed, 11 Jul 2018 13:40:19 +0800
Subject: [PATCH 32/45] Hide Optimizer methods

---
 python/paddle/fluid/optimizer.py              | 24 +++++++++----------
 .../fluid/tests/unittests/test_optimizer.py   | 22 ++++++++---------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 75ee40fa9c..e2acf6d41a 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -29,7 +29,7 @@ __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
     'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
-    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer'
+    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'RMSPropOptimizer'
 ]
 
 
@@ -67,7 +67,7 @@ class Optimizer(object):
         self._LARS_weight_decay = LARS_weight_decay
 
     def _create_global_learning_rate(self):
-        lr = self.global_learning_rate()
+        lr = self._global_learning_rate()
 
         if isinstance(lr, framework.Variable):
             return
@@ -86,7 +86,7 @@ class Optimizer(object):
             dtype='float32' if self._dtype == None else self._dtype,
             persistable=True)
 
-    def global_learning_rate(self, program=None):
+    def _global_learning_rate(self, program=None):
         """
         get global decayed learning rate
         :return:
@@ -110,9 +110,9 @@ class Optimizer(object):
             return param_lr
         else:
             if param_lr == 1.0:
-                return self.global_learning_rate()
+                return self._global_learning_rate()
             else:
-                return self.global_learning_rate() * param_lr
+                return self._global_learning_rate() * param_lr
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -185,10 +185,10 @@ class Optimizer(object):
                             format(name, param.name))
         return self._accumulators[name][param.name]
 
-    def create_optimization_pass(self,
-                                 parameters_and_grads,
-                                 loss,
-                                 startup_program=None):
+    def _create_optimization_pass(self,
+                                  parameters_and_grads,
+                                  loss,
+                                  startup_program=None):
         """Add optimization operators to update gradients to variables.
 
         Args:
@@ -221,7 +221,7 @@ class Optimizer(object):
             self._create_global_learning_rate()
             if self._LARS_weight_decay > 0.0:
                 layers.append_LARS(parameters_and_grads,
-                                   self.global_learning_rate(),
+                                   self._global_learning_rate(),
                                    self._LARS_weight_decay)
 
             optimize_ops = []
@@ -262,8 +262,8 @@ class Optimizer(object):
         params_grads = append_regularization_ops(params_grads,
                                                  self.regularization)
 
-        optimize_ops = self.create_optimization_pass(params_grads, loss,
-                                                     startup_program)
+        optimize_ops = self._create_optimization_pass(params_grads, loss,
+                                                      startup_program)
         return optimize_ops, params_grads
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index 7286c7c450..43385691bb 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -97,7 +97,7 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(
+        opts = momentum_optimizer._create_optimization_pass(
             params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(
+        opts = momentum_optimizer._create_optimization_pass(
             params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
@@ -214,8 +214,8 @@ class TestAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
-                                                          init_program)
+        opts = adagrad_optimizer._create_optimization_pass(
+            params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         self.assertEqual([op.type for op in opts],
                          ["fill_constant", "elementwise_mul", "adagrad"])
@@ -278,8 +278,8 @@ class TestAdamOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
-                                                       init_program)
+        opts = adam_optimizer._create_optimization_pass(params_grads, mul_out,
+                                                        init_program)
         self.assertEqual(len(opts), 5)
         self.assertEqual(
             [op.type for op in opts],
@@ -345,8 +345,8 @@ class TestAdamaxOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-        opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
-                                                         init_program)
+        opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out,
+                                                          init_program)
         self.assertEqual(len(opts), 4)
         self.assertEqual(
             [op.type for op in opts],
@@ -409,7 +409,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
-        opts = decayed_adagrad_optimizer.create_optimization_pass(
+        opts = decayed_adagrad_optimizer._create_optimization_pass(
             params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         self.assertEqual(
@@ -475,8 +475,8 @@ class TestFtrlOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
-        opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out,
-                                                       init_program)
+        opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out,
+                                                        init_program)
         self.assertEqual(len(opts), 3)
         self.assertEqual([op.type for op in opts],
                          ["fill_constant", "elementwise_mul", "ftrl"])

From ff07af8d933d3c7062616a67c1837b2672467f8a Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Wed, 11 Jul 2018 13:42:48 +0800
Subject: [PATCH 33/45] Hide calc_gradient

---
 python/paddle/fluid/backward.py                           | 5 +----
 python/paddle/fluid/tests/unittests/test_calc_gradient.py | 2 --
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 4faa063031..834edf8441 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -18,10 +18,7 @@ import collections
 import copy
 import unique_name
 
-__all__ = [
-    'append_backward',
-    'calc_gradient',
-]
+__all__ = ['append_backward']
 
 
 def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
index 06e676cd83..7f2a9e6971 100644
--- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
@@ -16,8 +16,6 @@ import unittest
 
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-import paddle.fluid.framework as framework
-import paddle.fluid.optimizer as optimizer
 from paddle.fluid.backward import calc_gradient
 
 

From 3d159689583696757167c02815cd1859364649b2 Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Wed, 11 Jul 2018 06:23:32 +0000
Subject: [PATCH 34/45] docs: fix some errors of description

---
 paddle/fluid/operators/unsqueeze_op.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index da542aa852..f2a15fdf57 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -111,19 +111,19 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor). The output tensor of unsqueeze operator.");
     AddAttr<std::vector<int>>("axes",
                               "(std::vector<int>). List of integers,"
-                              " indicate the dimensions to be inserted")
+                              " indicating the dimensions to be inserted")
         .AddCustomChecker([](const std::vector<int> &axes) {
           PADDLE_ENFORCE(!axes.empty(),
                          "Invalid axes, The unsqueeze axes is empty.");
           // Validity Check: axes dims (<6).
           PADDLE_ENFORCE(static_cast<int>(axes.size()) < 6,
-                         "Invalid dimensions, dynamic dimensions should within "
-                         "[1, 6] dimensions (Eigen limit).");
+                         "Invalid dimensions, dynamic dimensions should be "
+                         "within [1, 6] dimensions (Eigen limit).");
           // Validity Check: the range of unsqueeze aixs.
           for (int axis : axes) {
             PADDLE_ENFORCE(axis < 6,
-                           "Invalid dimensions, input axis should within "
-                           "[1, 6] dimensions (Eigen limit).");
+                           "Invalid dimensions, input axis should be"
+                           " within [1, 6] dimensions (Eigen limit).");
           }
         });
     AddAttr<bool>(

From 0ea468225bc1fff74f1e6193ffd418aca4d1856c Mon Sep 17 00:00:00 2001
From: chenweihang <sunny_cwh@163.com>
Date: Wed, 11 Jul 2018 06:35:29 +0000
Subject: [PATCH 35/45] docs: fix some errors of description

---
 paddle/fluid/operators/squeeze_op.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 805f198bf3..6c507baf3a 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -73,7 +73,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
                        "Invalid axis, the negative axis is out of range.");
         PADDLE_ENFORCE(in_dims[current] == 1,
                        "Invalid axis index, the axis that will be squeezed "
-                       "should equal 1.");
+                       "should be equal to 1.");
 
         if (!(should_squeeze[current])) {
           ++cnt_squeezed_dims;
@@ -123,7 +123,7 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor). The output tensor of squeeze operator.");
     AddAttr<std::vector<int>>("axes",
                               "(std::vector<int>). List of integers,"
-                              " indicate the dimensions to squeeze.")
+                              " indicating the dimensions to squeeze.")
         .SetDefault({});
     AddAttr<bool>("inplace",
                   "(default: false) Squeeze the source tensor's shape without "

From 2bb43ede487b1c0df185664f2ef4b334f2f512e3 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Wed, 11 Jul 2018 15:36:19 +0800
Subject: [PATCH 36/45] add rpath to _swig_paddle.so

---
 python/setup.py.in | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/python/setup.py.in b/python/setup.py.in
index 52138b414e..38a3873430 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -181,6 +181,14 @@ else:
     command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
 if os.system(command) != 0:
     raise Exception("patch core.so failed, command: %s" % command)
+if '${WITH_FLUID_ONLY}'== 'OFF':
+    # change rpath of _swig_paddle.so.
+    if "@APPLE@" == "1":
+        command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+    else:
+        command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+    if os.system(command) != 0:
+        raise Exception("patch _swig_paddle.so failed, command: %s" % command)
 
 setup(name='${PACKAGE_NAME}',
       version='${PADDLE_VERSION}',

From fbe25ef510bfa929729eddc4e1a011a6ffd2299f Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Wed, 11 Jul 2018 09:37:07 +0200
Subject: [PATCH 37/45] MKLDNN: Extending Conv MKLDNN op to reuse MKLDNN
 primitives (#11750)

* - Rebase of conv reuse

- clang formatter fixes

- Fix to conv reuse

- Yet another fix

- Fix

- Fix

- clang format

* - comment update
---
 paddle/fluid/operators/conv_mkldnn_op.cc | 172 ++++++++++++++++-------
 paddle/fluid/platform/mkldnn_helper.h    |  17 +--
 2 files changed, 129 insertions(+), 60 deletions(-)

diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 6b06913d1c..5bfa1aaa69 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -29,6 +29,79 @@ using mkldnn::stream;
 using platform::to_void_cast;
 using platform::GetMKLDNNFormat;
 
+class ConvMKLDNNHandler : public platform::MKLDNNHandler {
+ public:
+  ConvMKLDNNHandler(
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {
+    conv_pd_ = conv_pd;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
+    return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
+                                            "@dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {
+    auto src_pd = conv_pd_->src_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
+                               pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {
+    auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
+    auto weights_pd = conv_pd_->weights_primitive_desc();
+    return this->AcquireMemory(weights_pd, user_weights_pd,
+                               user_weights_memory_p, "@weights_mem_p",
+                               pipeline);
+  }
+
+  std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> dst_memory_p) {
+    auto prim_key = key_ + "@conv_p";
+    auto prim_desc_key = key_ + "@conv_pd";
+    auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (conv_p == nullptr) {
+      conv_p = std::make_shared<mkldnn::convolution_forward>(
+          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
+          *(dst_memory_p.get()));
+
+      dev_ctx_.SetBlob(prim_key, conv_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_p;
+  }
+
+  // Generate keys for storing/retrieving primitives for this operator
+  // TODO(jczaja): Make hashing function more optimal
+  static std::string GetHash(memory::dims& input_dims,
+                             memory::dims& weights_dims,
+                             std::vector<int>& strides,
+                             std::vector<int>& paddings,
+                             std::vector<int>& dilations, int groups,
+                             const std::string& suffix) {
+    return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
+           dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
+           suffix;
+  }
+
+ private:
+  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd_;
+};
+
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -36,10 +109,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
 
-    // Get unique name for index
-    const std::string key = ctx.op().Output("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
-
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -80,68 +149,62 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // create mkldnn memory from input tensors (data/weights)
-    auto user_src_memory = memory(
-        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
-        to_void_cast(input_data));
-    auto user_weights_memory =
-        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
-                mkldnn_engine},
-               to_void_cast(filter_data));
+    // Get unique name for storing MKLDNN primitives
+    const std::string key = ConvMKLDNNHandler::GetHash(
+        src_tz, weights_tz, strides, paddings, dilations, groups,
+        ctx.op().Output("Output"));
+    const std::string key_conv_pd = key + "@conv_pd";
+
+    std::vector<primitive> pipeline;
+
+    auto user_src_md = platform::MKLDNNMemDesc(
+        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+    auto user_weights_md = platform::MKLDNNMemDesc(
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
      * the memory format preferred for best performance
      */
-    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
-                                          memory::format::any);
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, memory::data_type::f32, memory::format::any);
-    auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
-                                          memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+    auto dst_md = platform::MKLDNNMemDesc(
+        dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
 
     // create a conv primitive descriptor and save it for usage in backward
     std::shared_ptr<conv_fwd::primitive_desc> conv_pd = ConvFwdPrimitiveDesc(
         src_md, weights_md, dst_md, strides, paddings, mkldnn_engine);
+    // Save conv_pd/src_memory/weights_memory for backward pass
+    dev_ctx.SetBlob(key_conv_pd, conv_pd);
 
-    // create reorder primitive if the input format is not the preferred one
-    auto src_memory = user_src_memory;
-    primitive reorder_src;
-    bool is_src_reordered = false;
-    if (memory::primitive_desc(conv_pd->src_primitive_desc()) !=
-        user_src_memory.get_primitive_desc()) {
-      src_memory = memory(conv_pd->src_primitive_desc());
-      reorder_src = reorder(user_src_memory, src_memory);
-      is_src_reordered = true;
-    }
-    auto weights_memory = user_weights_memory;
-    primitive reorder_weights;
-    bool is_weights_reordered = false;
-    if (memory::primitive_desc(conv_pd->weights_primitive_desc()) !=
-        user_weights_memory.get_primitive_desc()) {
-      weights_memory = memory(conv_pd->weights_primitive_desc());
-      reorder_weights = reorder(user_weights_memory, weights_memory);
-      is_weights_reordered = true;
-    }
+    ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
 
-    // create memory primitive for conv dst
-    auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data);
+    // create mkldnn memory from input tensors (data/weights)
+    auto user_src_memory_p =
+        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
+    auto user_weights_memory_p = handler.AcquireWeightsMemory(
+        user_weights_md, to_void_cast<T>(filter_data));
+
+    // create reorder primitive if the input format is not the preferred one
+    auto src_memory_p =
+        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
+    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
+        user_weights_memory_p, pipeline);
+    auto dst_memory_p =
+        handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
 
     // create convolution op primitive
-    auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory);
+    auto conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                             dst_memory_p);
 
     // push primitive to stream and wait until it's executed
-    std::vector<primitive> pipeline;
-    if (is_src_reordered) pipeline.push_back(reorder_src);
-    if (is_weights_reordered) pipeline.push_back(reorder_weights);
-    pipeline.push_back(conv_prim);
+    pipeline.push_back(*conv_p);
     stream(stream::kind::eager).submit(pipeline).wait();
 
-    // Save conv_pd/src_memory/weights_memory for backward pass
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
-
     output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(dst_memory));
+    output->set_format(GetMKLDNNFormat(*dst_memory_p));
   }
 
  private:
@@ -197,13 +260,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     if (!input_grad && !filter_grad) return;
 
-    // Get an unique name from "argument" name of "Output" variable
-    // This name will be used as key when saving info into device context
-    const std::string key = ctx.op().Input("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
-
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
 
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
@@ -223,6 +283,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
+    // Get a unique name from "argument" name of "Output" variable
+    // This name will be used as key when saving info into device context
+    const std::string key =
+        ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings,
+                                   dilations, groups, ctx.op().Input("Output"));
+
+    const std::string key_conv_pd = key + "@conv_pd";
+
     // create mkldnn memory from input tensors (input/weights/output_grad)
     auto user_src_memory = memory(
         {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 33fec2c107..a8f93e6848 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -222,15 +222,16 @@ class MKLDNNHandler {
 
   static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
                              const std::string& suffix) {
-    auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
-      std::string dstr = "";
-      for (size_t i = 0; i < operand_dims.size(); ++i) {
-        dstr += std::to_string(operand_dims[i]) + "-";
-      }
-      return dstr;
-    };
-
     return dims2str(operand_dims) + suffix;
+  };
+
+ protected:
+  static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
   }
 
  protected:

From 2cc6ca43a0b37b5ddc822e1ea495e6a8f3e2a95b Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 11 Jul 2018 16:02:45 +0800
Subject: [PATCH 38/45] Add framework_proto to device context deps

---
 paddle/fluid/platform/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 20037d0764..e0d7937ae2 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -46,7 +46,7 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
-    place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+    place eigen3 stringpiece cpu_helper framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
 cc_test(init_test SRCS init_test.cc DEPS device_context)

From 0cefb9461f596cacb76c7659aef3a55f200a1f6d Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Wed, 11 Jul 2018 16:46:55 +0800
Subject: [PATCH 39/45] add topological sorting (#12059)

---
 .../inference/analysis/data_flow_graph.cc     | 86 ++++++++++++++++++-
 .../inference/analysis/data_flow_graph.h      | 36 ++++++++
 .../analysis/data_flow_graph_tester.cc        | 69 ++++++++++++++-
 3 files changed, 188 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index d09bf3ed16..bd24e8a7d9 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -90,6 +90,20 @@ std::string DataFlowGraph::DotString() const {
   return dot.Build();
 }
 
+std::string DataFlowGraph::HumanReadableInfo(bool show_values,
+                                             bool show_functions) const {
+  std::stringstream values, functions;
+  for (auto &n : nodes.nodes()) {
+    if (show_values && n->IsValue()) {
+      values << n->repr() << "\n";
+    }
+    if (show_functions && n->IsFunction()) {
+      functions << n->repr() << "\n";
+    }
+  }
+  return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str();
+}
+
 //
 // NodesBFSIterator
 //
@@ -146,7 +160,7 @@ bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
   if ((!queue_.empty()) && (!other.queue_.empty())) {
     return queue_.front() == other.queue_.front() &&
            visited_.size() == other.visited_.size();  // here need to check the
-                                                      // equality of queue and
+    // equality of queue and
     // visited. Just a light but week implementation.
   }
   return false;
@@ -208,6 +222,76 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
   return stack_.top();
 }
 
+// Computes the topological order of all nodes reachable from `source`; a node
+// is appended to sorted_ only after every one of its inlinks was appended.
+GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
+    const std::vector<Node *> &source) {
+  PADDLE_ENFORCE(!source.empty(),
+                 "Start points of topological sorting should not be empty!");
+  std::unordered_set<Node *> visited;
+  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+
+  while (!to_visit.empty()) {
+    const size_t num_sorted_before = sorted_.size();
+    // Walk a snapshot, since to_visit is modified while nodes are emitted.
+    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
+    for (auto *p : queue) {
+      const bool inlinks_ready =
+          std::all_of(p->inlinks.begin(), p->inlinks.end(),
+                      [&](Node *x) { return visited.count(x) != 0; });
+      if (!inlinks_ready) continue;
+      sorted_.push_back(p);
+      for (auto *out : p->outlinks) {
+        if (!visited.count(out)) to_visit.insert(out);
+      }
+      to_visit.erase(p);
+      visited.insert(p);
+    }
+    // Nothing emitted in a whole pass means no later pass can progress.
+    PADDLE_ENFORCE_GT(sorted_.size(), num_sorted_before,
+                      "Topological sorting is stuck: the graph has a cycle or "
+                      "a node whose inlink is unreachable from the sources.");
+  }
+}
+
+GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other)
+    : sorted_(other.sorted_), cursor_(other.cursor_) {}
+
+Node &GraphTraits<DataFlowGraph>::NodesTSIterator::operator*() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  return *sorted_[cursor_];
+}
+
+paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator
+    &GraphTraits<DataFlowGraph>::NodesTSIterator::operator++() {
+  if (++cursor_ >= sorted_.size()) {
+    sorted_.clear();
+    cursor_ = 0;
+  }
+  return *this;
+}
+paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator &
+GraphTraits<DataFlowGraph>::NodesTSIterator::operator=(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other) {
+  cursor_ = other.cursor_;
+  sorted_ = other.sorted_;
+  return *this;
+}
+
+bool GraphTraits<DataFlowGraph>::NodesTSIterator::operator==(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other) {
+  return sorted_ == other.sorted_ && cursor_ == other.cursor_;
+}
+
+Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  return sorted_[cursor_];
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index a4fefc83e0..5dd914d197 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -48,6 +48,9 @@ struct DataFlowGraph {
   // Output a DOT graph file for debug.
   std::string DotString() const;
 
+  std::string HumanReadableInfo(bool show_values = true,
+                                bool show_functions = true) const;
+
  private:
   // Remove duplicate edges and so on.
   void Clean();
@@ -107,6 +110,32 @@ struct GraphTraits<DataFlowGraph> {
     std::unordered_set<Node *> visited_;
   };
 
+  // Topological-order iterator; the whole order is computed at construction.
+  struct NodesTSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesTSIterator() = default;  // Acts as the end iterator.
+    explicit NodesTSIterator(const std::vector<Node *> &source);
+    NodesTSIterator(NodesTSIterator &&other)
+        : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
+      other.cursor_ = 0;
+    }
+    NodesTSIterator(const NodesTSIterator &other);
+
+    Node &operator*();
+    NodesTSIterator &operator++();
+    // operator== compares the sorted sequence and the cursor. operator++
+    // clears the sequence and resets the cursor once exhausted, so a finished
+    // iterator compares equal to a default-constructed one.
+    NodesTSIterator &operator=(const NodesTSIterator &other);
+    bool operator==(const NodesTSIterator &other);
+    bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
+    Node *operator->();
+
+   private:
+    std::vector<Node *> sorted_;
+    size_t cursor_{0};  // Index into sorted_; unsigned like size().
+  };
+
   explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
 
   // default use BFS to visit the nodes.
@@ -119,17 +148,24 @@ struct GraphTraits<DataFlowGraph> {
   iterator_range<NodesDFSIterator> nodes_in_DFS() {
     return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
   }
+  iterator_range<NodesTSIterator> nodes_in_TS() {
+    return iterator_range<NodesTSIterator>(nodes_ts_begin(), nodes_ts_end());
+  }
 
  private:
   NodesBFSIterator nodes_bfs_begin() {
     return NodesBFSIterator(graph_->inputs);
   }
   NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
+
   NodesDFSIterator nodes_dfs_begin() {
     return NodesDFSIterator(graph_->inputs);
   }
   NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
 
+  NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); }
+  NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }
+
  private:
   DataFlowGraph *graph_;
 };
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
index 9d7cceeb65..7912f8d7f1 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -24,11 +24,11 @@ TEST(DataFlowGraph, BFS) {
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
 
-  for (auto* in : dfg.inputs) {
+  for (auto *in : dfg.inputs) {
     LOG(INFO) << "inputs: " << in->name() << " "
               << static_cast<int>(in->type());
   }
-  for (auto* out : dfg.outputs) {
+  for (auto *out : dfg.outputs) {
     LOG(INFO) << "outputs: " << out->name() << " "
               << static_cast<int>(out->type());
   }
@@ -57,6 +57,71 @@ TEST(DataFlowGraph, DFS) {
   ASSERT_EQ(count, dfg.nodes.size());
 }
 
+// Topological sorting.
+/*
+ * Graph topology
+ * inputs: 0, 1, 2
+ * 0 -> 4
+ * 0 -> 5
+ * 1 -> 6
+ * 2 -> 7
+ * 4 -> 5
+ * 4 -> 7
+ * 4 -> 3
+ * 7 -> 3
+ */
+TEST(DataFlowGraph, TS) {
+  DataFlowGraph graph;
+
+  for (int i = 0; i < 8; i++) {
+    auto *node = graph.nodes.Create(Node::Type::kValue);
+    node->SetName("node-" + std::to_string(i));
+  }
+
+  auto add_link = [&](int i, int j) {
+    Node *source = graph.nodes.GetMutable(i);
+    Node *target = graph.nodes.GetMutable(j);
+    target->inlinks.push_back(source);
+    source->outlinks.push_back(target);
+  };
+
+  graph.inputs.push_back(graph.nodes.GetMutable(0));
+  graph.inputs.push_back(graph.nodes.GetMutable(1));
+  graph.inputs.push_back(graph.nodes.GetMutable(2));
+
+  add_link(0, 4);
+  add_link(0, 5);
+  add_link(1, 6);
+  add_link(2, 7);
+  add_link(4, 5);
+  add_link(4, 7);
+  add_link(4, 3);
+  add_link(7, 3);
+
+  auto its = GraphTraits<DataFlowGraph>(&graph).nodes_in_TS();
+  std::vector<int> sorted_ids;
+  for (auto it = its.begin(); it != its.end(); ++it) {
+    LOG(INFO) << it->name();
+    sorted_ids.push_back(it->id());
+  }
+
+  // Assert a occurs prior to b in the sorted_ids.
+  auto assert_positive_sequence_pair = [&](int a, int b) {
+    auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a);
+    auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b);
+    ASSERT_LT(a_offset, b_offset);
+  };
+
+  assert_positive_sequence_pair(2, 7);
+  assert_positive_sequence_pair(7, 3);
+  assert_positive_sequence_pair(4, 3);
+  assert_positive_sequence_pair(0, 4);
+  assert_positive_sequence_pair(0, 5);
+  assert_positive_sequence_pair(1, 6);
+  assert_positive_sequence_pair(4, 5);
+  assert_positive_sequence_pair(4, 7);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle

From cfa4479b063a1be79d7ff1924767507e31e7ca1d Mon Sep 17 00:00:00 2001
From: Noplz <yuan.gao.gavin@gmail.com>
Date: Wed, 11 Jul 2018 16:56:33 +0800
Subject: [PATCH 40/45] fix warning

---
 paddle/fluid/operators/detection/rpn_target_assign_op.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index 3b0c9b2886..9a1643d5b3 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -86,8 +86,9 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
                          std::minstd_rand engine,
                          std::vector<int>* inds) const {
     std::uniform_real_distribution<float> uniform(0, 1);
-    if (inds->size() > num) {
-      for (int i = num; i < inds->size(); ++i) {
+    const int64_t size = static_cast<int64_t>(inds->size());
+    if (size > num) {
+      for (int64_t i = num; i < size; ++i) {
         int rng_ind = std::floor(uniform(engine) * i);
         if (rng_ind < num)
           std::iter_swap(inds->begin() + rng_ind + offset,

From 29145e1e31173eec6ce7a55d8bd294e46cf628d7 Mon Sep 17 00:00:00 2001
From: lemon34 <540476129@qq.com>
Date: Wed, 11 Jul 2018 17:17:13 +0800
Subject: [PATCH 41/45] change im2sequence for ctc batch inference (#11696)

* change im2sequence for ctc batch inference

* Update im2sequence_op.cc

* change im2sequence for ctc batch inference

* update

* change PR by comment

* fix ocr test error

* fix test_im2sequence

* modify the old name to standard name

* fix test_layers failed
---
 paddle/fluid/operators/im2sequence_op.cc      |  25 ++-
 paddle/fluid/operators/im2sequence_op.h       | 125 ++++++++----
 paddle/fluid/operators/math/im2col.cc         |  26 ---
 paddle/fluid/operators/math/im2col.cu         |  30 ---
 python/paddle/fluid/layers/nn.py              |  52 +++--
 .../tests/unittests/test_im2sequence_op.py    | 180 ++++++++++++++----
 .../fluid/tests/unittests/test_layers.py      |   8 +-
 7 files changed, 302 insertions(+), 144 deletions(-)

diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
index 0669661d22..c8c7f36536 100644
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/im2sequence_op.h"
+#include <string>
 #include <vector>
 
 namespace paddle {
@@ -28,20 +29,19 @@ class Im2SequenceOp : public framework::OperatorWithKernel {
                    "Input(X) of Im2SequenceOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of Im2SequenceOp op should not be null.");
-
     auto in_dim = ctx->GetInputDim("X");
+
     PADDLE_ENFORCE_EQ(in_dim.size(), 4,
                       "Input(X) format must be 4D tensor, eg., NCHW.");
-
-    auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
-    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-
     int batch_size = in_dim[0];
     int img_channels = in_dim[1];
     int img_height = in_dim[2];
     int img_width = in_dim[3];
 
+    auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
+    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
     int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
                                          paddings[2], strides[0]);
     int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
@@ -61,6 +61,10 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
              "C: channels"
              "H: height"
              "W: width");
+    AddInput("Y",
+             "(Tensor) The input tensor of image real size(H, W)."
+             "2-D with shape [batchsize, 2]")
+        .AsDispensable();
     AddOutput("Out", "(LodTensor) The output data of im2sequence op,");
     AddAttr<std::vector<int>>("kernels",
                               "(vector<int>), the "
@@ -73,6 +77,13 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
                               "(vector<int> default:{0, 0, 0, 0}), the "
                               "paddings(up_pad, left_pad, down_pad, right_pad)")
         .SetDefault({0, 0, 0, 0});
+    AddAttr<std::vector<int>>("out_stride",
+                              "The attribute is valid only when input(Y) "
+                              "is not NULL. This attribute represents the "
+                              "scaling of the pic through the CNN "
+                              "(vector<int> default:{1, 1}), the out_stride"
+                              " (out_stride_height, out_stride_width)")
+        .SetDefault({1, 1});
     AddComment(R"DOC(
 This op uses kernels to scan images and converts these images to sequences.
 After expanding, The number of time steps are output_height * output_width
@@ -123,7 +134,7 @@ output.data = [[ 6.  2.  8.  3.  2.  4.  6.  3.]
                [ 7.  1.  7.  9.  2.  1.  3.  5.]
                [ 5.  7.  2.  4.  1.  3.  9.  0.]
                [ 7.  9.  4.  8.  3.  5.  0.  8.]]
-output.dims = {8, 9}
+output.dims = {8, 8}
 output.lod = [[0, 4, 8]]
 
 )DOC");
diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h
index d792c68f78..5bfb91db18 100644
--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #pragma once
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
@@ -39,50 +40,106 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     const Tensor* in = ctx.Input<Tensor>("X");
     LoDTensor* out = ctx.Output<LoDTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    // TODO(wanghaoshuang): Add layout checker after 'set_layout'
-    // being available for python API
-    // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW,
-    //                  "Input(X) layout must be NCHW");
     auto in_dim = in->dims();
     int batch_size = in_dim[0];
     int img_channels = in_dim[1];
     int img_height = in_dim[2];
     int img_width = in_dim[3];
-
     auto kernels = ctx.Attr<std::vector<int>>("kernels");
     auto strides = ctx.Attr<std::vector<int>>("strides");
     auto paddings = ctx.Attr<std::vector<int>>("paddings");
-    int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
-                                         paddings[2], strides[0]);
-    int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
-                                        paddings[3], strides[1]);
-
-    const std::vector<int> dilations({1, 1});
-
-    auto out_dims = out->dims();
-    out->Resize({batch_size, out->numel() / batch_size});
-    for (int i = 0; i < batch_size; i++) {
-      const Tensor src =
-          in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
-      Tensor dst = out->Slice(i, i + 1).Resize(
-          {output_height, output_width, img_channels, kernels[0], kernels[1]});
-
-      math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      f(dev_ctx, src, dilations, strides, paddings, &dst);
-    }
-    out->Resize(out_dims);
-
-    // set lod information
-    // TODO(wanghaoshuang): Move this to InferShape
-    framework::LoD lod(1);
-    lod[0].reserve(batch_size + 1);
-    for (int i = 0, offset = 0; i < batch_size + 1; ++i) {
+    if (ctx.HasInput("Y") && batch_size > 1) {
+      const Tensor* imgrealsize = ctx.Input<Tensor>("Y");
+      auto out_stride = ctx.Attr<std::vector<int>>("out_stride");
+      Tensor cpu_shape_tensor;
+      TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor);
+      std::vector<int> imgreal_h;
+      std::vector<int> imgreal_w;
+      std::vector<int> output_height;
+      std::vector<int> output_width;
+      int result = 0;
+      for (int i = 0; i < batch_size; i++) {
+        int tmp_real_h = static_cast<int>((cpu_shape_tensor.data<T>())[2 * i]);
+        int tmp_real_w =
+            static_cast<int>((cpu_shape_tensor.data<T>())[2 * i + 1]);
+        if (tmp_real_h % out_stride[0] == 0) {
+          tmp_real_h = tmp_real_h / out_stride[0];
+        } else {
+          tmp_real_h = tmp_real_h / out_stride[0] + 1;
+        }
+        if (tmp_real_w % out_stride[1] == 0) {
+          tmp_real_w = tmp_real_w / out_stride[1];
+        } else {
+          tmp_real_w = tmp_real_w / out_stride[1] + 1;
+        }
+        imgreal_h.push_back(tmp_real_h);
+        imgreal_w.push_back(tmp_real_w);
+        output_height.push_back(Im2SeqOutputSize(
+            imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0]));
+        output_width.push_back(Im2SeqOutputSize(
+            imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1]));
+        result += output_height[i] * output_width[i];
+      }
+
+      out->mutable_data<T>({result, img_channels * kernels[0] * kernels[1]},
+                           ctx.GetPlace());
+
+      const std::vector<int> dilations({1, 1});
+      int offset_out = 0;
+      for (int i = 0; i < batch_size; i++) {
+        const Tensor src =
+            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+        Tensor dst = out->Slice(offset_out,
+                                offset_out + output_height[i] * output_width[i])
+                         .Resize({output_height[i], output_width[i],
+                                  img_channels, kernels[0], kernels[1]});
+        offset_out += output_height[i] * output_width[i];
+
+        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        f(dev_ctx, src, dilations, strides, paddings, &dst);
+      }
+      framework::LoD lod(1);
+      lod[0].reserve(batch_size + 1);
+      int offset = 0;
+      lod[0].push_back(offset);
+      for (int i = 0; i < batch_size; ++i) {
+        offset += output_height[i] * output_width[i];
+        lod[0].push_back(offset);
+      }
+      out->set_lod(lod);
+    } else {
+      out->mutable_data<T>(ctx.GetPlace());
+      int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
+                                           paddings[2], strides[0]);
+      int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
+                                          paddings[3], strides[1]);
+
+      const std::vector<int> dilations({1, 1});
+      auto out_dims = out->dims();
+      out->Resize({batch_size, out->numel() / batch_size});
+      for (int i = 0; i < batch_size; i++) {
+        const Tensor src =
+            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+        Tensor dst =
+            out->Slice(i, i + 1).Resize({output_height, output_width,
+                                         img_channels, kernels[0], kernels[1]});
+
+        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        f(dev_ctx, src, dilations, strides, paddings, &dst);
+      }
+      out->Resize(out_dims);
+      framework::LoD lod(1);
+      lod[0].reserve(batch_size + 1);
+      int offset = 0;
       lod[0].push_back(offset);
-      offset += output_height * output_width;
+      for (int i = 0; i < batch_size; ++i) {
+        offset += output_height * output_width;
+        lod[0].push_back(offset);
+      }
+      out->set_lod(lod);
     }
-    out->set_lod(lod);
   }
 };
 
diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 336d6febc2..a50b9ace39 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -43,21 +43,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     int col_height = col->dims()[3];
     int col_width = col->dims()[4];
 
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       ((dilation[0] * (filter_height - 1) + 1))) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       ((dilation[1] * (filter_width - 1) + 1))) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-
     int channels_col = im_channels * filter_height * filter_width;
 
     const T* im_data = im.data<T>();
@@ -178,17 +163,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
     int col_height = col->dims()[0];
     int col_width = col->dims()[1];
 
-    PADDLE_ENFORCE_EQ(
-        (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
-        col_height,
-        "Output_height and padding(padding_up, padding_down) are "
-        "inconsistent.");
-    PADDLE_ENFORCE_EQ(
-        (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
-        col_width,
-        "col_width and padding(padding_left, padding_right) are "
-        "inconsistent.");
-
     const T* im_data = im.data<T>();
     T* col_data = col->data<T>();
 
diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu
index eecb233d22..4897767f4d 100644
--- a/paddle/fluid/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
@@ -77,21 +77,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     int col_height = col->dims()[3];
     int col_width = col->dims()[4];
 
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
-
     int num_outputs = im_channels * col_height * col_width;
     int blocks = (num_outputs + 1024 - 1) / 1024;
     int block_x = 512;
@@ -274,21 +259,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
     int col_height = col->dims()[0];
     int col_width = col->dims()[1];
 
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
-
     int block_dim_x = 0;
     int block_dim_y = 0;
     if (filter_height <= 4 && filter_width <= 4) {
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bcf520d5a4..07b806f544 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1,4 +1,18 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#   Copyright (c ) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -3900,7 +3914,13 @@ def transpose(x, perm, name=None):
     return out
 
 
-def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
+def im2sequence(input,
+                filter_size=1,
+                stride=1,
+                padding=0,
+                input_image_size=None,
+                out_stride=1,
+                name=None):
     """
     Extracts image patches from the input tensor to form a tensor of shape
     {input.batch_size * output_height * output_width, filter_size_H *
@@ -3937,6 +3957,15 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
             padding_up = padding_down = padding_left = padding_right = padding
             Default: padding = 0.
 
+        input_image_size(Variable): The real size of each image, with shape
+            [batchsize, 2]. It is dispensable and only for batch inference.
+
+        out_stride(int|tuple): The scaling of image through CNN. It is
+            dispensable. It is valid only when input_image_size is not null.
+            If out_stride is a tuple, it must contain two integers,
+            (out_stride_H, out_stride_W). Otherwise,
+            the out_stride_H = out_stride_W = out_stride.
+
         name (int): The name of this layer. It is optional.
 
     Returns:
@@ -3987,7 +4016,7 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
                            [ 5.  7.  2.  4.  1.  3.  9.  0.]
                            [ 7.  9.  4.  8.  3.  5.  0.  8.]]
 
-            output.dims = {8, 9}
+            output.dims = {8, 8}
 
             output.lod = [[4, 4]]
 
@@ -4009,18 +4038,17 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
     if len(padding) == 2:
         padding.append(padding[0])
         padding.append(padding[1])
-
+    inputs = {"X": input}
+    attrs = {"kernels": filter_size, "strides": stride, "paddings": padding}
+    if input_image_size:
+        if isinstance(out_stride, int):
+            out_stride = [out_stride, out_stride]
+        inputs["Y"] = input_image_size
+        attrs["out_stride"] = out_stride
     helper = LayerHelper('im2sequence', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
     helper.append_op(
-        type='im2sequence',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'kernels': filter_size,
-            'strides': stride,
-            'paddings': padding,
-        })
+        type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
     return out
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
index 4946475f11..13bc576874 100644
--- a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
+++ b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
@@ -16,23 +16,48 @@ import numpy as np
 from op_test import OpTest
 
 
-def get_output_shape(attrs, in_shape):
+def get_output_shape(attrs, in_shape, img_real_size):
+    batchsize = in_shape[0]
     img_height = in_shape[2]
     img_width = in_shape[3]
+    paddings = np.array(attrs['paddings']).astype("int32")
+    kernels = np.array(attrs['kernels']).astype("int32")
+    strides = np.array(attrs['strides']).astype("int32")
+    output_height = np.zeros((1, batchsize)).astype("int32")
+    output_width = np.zeros((1, batchsize)).astype("int32")
+    if len(img_real_size):
+        out_stride = np.array(attrs['out_stride']).astype("int32")
+        imgreal_h = 0
+        imgreal_w = 0
+        for index in range(batchsize):
+            if img_real_size[index, 0] % out_stride[0] == 0:
+                imgreal_h = img_real_size[index, 0] / out_stride[0]
+            else:
+                imgreal_h = img_real_size[index, 0] / out_stride[0] + 1
+            if img_real_size[index, 1] % out_stride[1] == 0:
+                imgreal_w = img_real_size[index, 1] / out_stride[1]
+            else:
+                imgreal_w = img_real_size[index, 1] / out_stride[1] + 1
+            output_height[0,index] = \
+              1 +  \
+              (imgreal_h + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+                  strides[0]
 
-    paddings = attrs['paddings']
-    kernels = attrs['kernels']
-    strides = attrs['strides']
+            output_width[0,index] = \
+              1 + \
+              (imgreal_w + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
+                  strides[1]
+    else:
+        for index in range(batchsize):
+            output_height[0,index] = \
+              1 +  \
+              (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+                  strides[0]
 
-    output_height = \
-      1 +  \
-      (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
-          strides[0]
-
-    output_width = \
-      1 + \
-      (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
-          strides[1]
+            output_width[0,index] = \
+              1 + \
+              (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
+                  strides[1]
 
     return output_height, output_width
 
@@ -75,22 +100,25 @@ def im2col(attrs, im, col):
                                     im_row_offset][im_col_offset]
 
 
-def Im2Sequence(inputs, attrs):
-    output_height, output_width = get_output_shape(attrs, inputs.shape)
+def Im2Sequence(inputs, img_real_size, attrs):
+    output_height, output_width = get_output_shape(attrs, inputs.shape,
+                                                   img_real_size)
     img_channels = inputs.shape[1]
     batch_size = inputs.shape[0]
-    out = np.zeros([
-        batch_size, output_height, output_width, img_channels,
-        attrs['kernels'][0], attrs['kernels'][1]
-    ]).astype("float32")
-
-    for i in range(len(inputs)):
-        im2col(attrs, inputs[i], out[i])
-
-    out = out.reshape([
-        batch_size * output_height * output_width,
-        img_channels * attrs['kernels'][0] * attrs['kernels'][1]
-    ])
+    out = []
+    for index in range(batch_size):
+        tmp = np.zeros([
+            output_height[0, index], output_width[0, index], img_channels,
+            attrs['kernels'][0], attrs['kernels'][1]
+        ]).astype("float32")
+        out.append(tmp)
+    for index in range(len(inputs)):
+        im2col(attrs, inputs[index], out[index])
+        out[index] = out[index].reshape([
+            output_height[0, index] * output_width[0, index],
+            img_channels * attrs['kernels'][0] * attrs['kernels'][1]
+        ])
+    out = np.concatenate(out, axis=0)
     return out
 
 
@@ -103,7 +131,7 @@ class TestBlockExpandOp(OpTest):
         self.attrs = {
             'kernels': [2, 2],
             'strides': [1, 1],
-            'paddings': [1, 1, 1, 1]
+            'paddings': [1, 1, 1, 1],
         }
 
     def setUp(self):
@@ -113,7 +141,8 @@ class TestBlockExpandOp(OpTest):
             self.batch_size, self.img_channels, self.img_height, self.img_width
         ]).astype("float32")
 
-        out = Im2Sequence(x, self.attrs)
+        real_size = np.array([]).astype("float32")
+        out = Im2Sequence(x, real_size, self.attrs)
         self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
@@ -133,20 +162,20 @@ class TestBlockExpandOpCase2(TestBlockExpandOp):
         self.attrs = {
             'kernels': [2, 1],
             'strides': [2, 1],
-            'paddings': [2, 1, 2, 1]
+            'paddings': [2, 1, 2, 1],
         }
 
 
 class TestBlockExpandOpCase3(TestBlockExpandOp):
     def config(self):
-        self.batch_size = 3
+        self.batch_size = 2
         self.img_channels = 1
         self.img_height = 4
         self.img_width = 5
         self.attrs = {
             'kernels': [2, 1],
             'strides': [2, 1],
-            'paddings': [2, 0, 2, 0]
+            'paddings': [2, 0, 2, 0],
         }
 
 
@@ -159,9 +188,94 @@ class TestBlockExpandOpCase4(TestBlockExpandOp):
         self.attrs = {
             'kernels': [2, 2],
             'strides': [1, 1],
-            'paddings': [0, 0, 0, 0]
+            'paddings': [0, 0, 0, 0],
+        }
+
+
+class TestBlockExpandOpCase5(OpTest):
+    def config(self):
+        self.batch_size = 1
+        self.img_channels = 3
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [2, 1],
+            'paddings': [2, 1, 2, 1],
+            'out_stride': [2, 2],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}  #l ??
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBlockExpandOpCase6(OpTest):
+    def config(self):
+        self.batch_size = 3
+        self.img_channels = 1
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [1, 1],
+            'paddings': [0, 0, 0, 0],
+            'out_stride': [1, 1],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}  #l ??
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBlockExpandOpCase7(OpTest):
+    def config(self):
+        self.batch_size = 2
+        self.img_channels = 2
+        self.img_height = 3
+        self.img_width = 3
+        self.attrs = {
+            'kernels': [2, 2],
+            'strides': [1, 1],
+            'paddings': [1, 0, 1, 0],
+            'out_stride': [2, 2],
         }
 
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[6, 6], [4, 4]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
 
 if __name__ == '__main__':
     unittest.main()
+#set shiftwidth=4 set expandtab set tabstop=4
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 842d34c07e..31ae4e7d8c 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -251,12 +251,16 @@ class TestBook(unittest.TestCase):
         print(str(program))
 
     def test_im2sequence(self):
-        print("test_im2sequence")
         program = Program()
         with program_guard(program):
             x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
+            y = layers.data(name='y', shape=[], dtype='float32')
             output = layers.im2sequence(
-                input=x, stride=[1, 1], filter_size=[2, 2])
+                input=x,
+                input_image_size=y,
+                stride=[1, 1],
+                filter_size=[2, 2],
+                out_stride=[1, 1])
             self.assertIsNotNone(output)
         print(str(program))
 

From 335e4b9d52a978aa6a7a845a24b044ab0927c91e Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Wed, 11 Jul 2018 20:17:10 +0800
Subject: [PATCH 42/45] fix a dead lock bug

---
 paddle/fluid/framework/details/threaded_ssa_graph_executor.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 8a8c3a5938..07097c7e75 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -100,11 +100,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
 
     if (timeout) {
-      std::lock_guard<std::mutex> l(exception_mu_);
+      std::unique_lock<std::mutex> l(exception_mu_);
       if (exception_) {
+        l.unlock();
         for (auto &run_op_future : run_op_futures_) {
           run_op_future.wait();
         }
+        l.lock();
         std::exception *exp = exception_.get();
         if (dynamic_cast<platform::EOFException *>(exp)) {
           auto e = *static_cast<platform::EOFException *>(exp);

From 010c0ab0a1c83259e09b0b0e929ccbc76a7d3177 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Wed, 11 Jul 2018 22:01:41 +0800
Subject: [PATCH 43/45] add install_name_tool check on macos

---
 python/CMakeLists.txt | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index bec161a571..2590081150 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -92,8 +92,15 @@ install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
     DESTINATION opt/paddle/share/wheels
 )
 
-find_program(PATCHELF_EXECUTABLE patchelf)
-if(NOT PATCHELF_EXECUTABLE)
-  message(FATAL_ERROR "patchelf not found, please install it.\n"
-          "For Ubuntu, the command is: apt-get install -y patchelf.")
-endif()
+if(APPLE)
+  find_program(INSTALL_NAME_TOOL_EXECUTABLE install_name_tool)
+  if(NOT INSTALL_NAME_TOOL_EXECUTABLE)
+    message(FATAL_ERROR "install_name_tool not found, please check.\n")
+  endif()
+else(APPLE)
+  find_program(PATCHELF_EXECUTABLE patchelf)
+  if(NOT PATCHELF_EXECUTABLE)
+    message(FATAL_ERROR "patchelf not found, please install it.\n"
+            "For Ubuntu, the command is: apt-get install -y patchelf.")
+  endif()
+endif(APPLE)

From 031a07f9c0419d46f5aaec4c86cf72e69f24f49f Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Thu, 12 Jul 2018 10:55:01 +0800
Subject: [PATCH 44/45] add rpc_deadline in bootstrapper

---
 python/paddle/fluid/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 3034c1a087..740de13007 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -119,7 +119,7 @@ def __bootstrap__():
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
         'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem'
+        'init_allocated_mem', 'rpc_deadline'
     ]
     if core.is_compiled_with_cuda():
         read_env_flags += [

From bbd53131f04435a50e374756c89cc549f2789b63 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Thu, 12 Jul 2018 13:54:19 +0800
Subject: [PATCH 45/45] update

---
 paddle/fluid/pybind/pybind.cc   | 9 +++++++++
 python/paddle/fluid/__init__.py | 5 ++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 0c523b6f17..96ab5d457b 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -66,6 +66,14 @@ bool IsCompiledWithCUDA() {
 #endif
 }
 
+bool IsCompiledWithDIST() {
+#ifdef PADDLE_WITH_DIST
+  return true;
+#else
+  return false;
+#endif
+}
+
 PYBIND11_PLUGIN(core) {
   py::module m("core", "C++ core of PaddlePaddle");
 
@@ -508,6 +516,7 @@ All parameter, weight, gradient are variables in Paddle.
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+  m.def("is_compiled_with_dist", IsCompiledWithDIST);
 #ifdef PADDLE_WITH_CUDA
   m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
     // Only GPUs with Compute Capability >= 53 support float16
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 740de13007..ba562d3ba9 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -119,8 +119,11 @@ def __bootstrap__():
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
         'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'rpc_deadline'
+        'init_allocated_mem'
     ]
+    if core.is_compiled_with_dist():
+        read_env_flags.append('rpc_deadline')
+
     if core.is_compiled_with_cuda():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic'