From 03ccb9a461db7650fd1dc749f2f61a4df253bf31 Mon Sep 17 00:00:00 2001
From: Yihua Xu
Date: Thu, 15 Nov 2018 16:07:16 +0800
Subject: [PATCH 01/80] Optimize the stack operator

---
 paddle/fluid/operators/stack_op.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h
index d236c5b943..f1692ae956 100644
--- a/paddle/fluid/operators/stack_op.h
+++ b/paddle/fluid/operators/stack_op.h
@@ -147,16 +147,23 @@ class StackKernel : public framework::OpKernel<T> {
     auto &dim = x[0]->dims();
     for (auto i = 0; i < axis; ++i) pre *= dim[i];
     for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
-    int total_num = pre * n * post;
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
 #ifdef __NVCC__
     thrust::device_vector<T *> device_x_vec(x_datas);
     auto x_data_arr = device_x_vec.data().get();
 #else
     auto x_data_arr = x_datas.data();
 #endif
-    StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
+    size_t x_offset = 0;
+    size_t y_offset = 0;
+    for (int i = 0; i < pre; i++) {
+      for (int j = 0; j < n; j++) {
+        std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset,
+                    post * sizeof(T));
+        y_offset += post;
+      }
+      x_offset += post;
+    }
 #ifdef __NVCC__
     // Wait() must be called because device_x_vec may be destructed before
     // kernel ends

From b969116988a793718c9cce0bbe98bf84c0215412 Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Fri, 16 Nov 2018 07:49:36 +0000
Subject: [PATCH 02/80] fix avg pool trt bug and fix cpplint

---
 .../inference/tensorrt/convert/CMakeLists.txt |   2 +-
 .../inference/tensorrt/convert/pool2d_op.cc   | 146 +++++++++++-------
 .../tensorrt/convert/test_pool2d_op.cc        |  16 +-
 .../inference/tensorrt/plugin/CMakeLists.txt  |   3 +-
 .../tensorrt/plugin/avg_pool_op_plugin.cu     |  62 ++++++++
 .../tensorrt/plugin/avg_pool_op_plugin.h      | 109 +++++++++++++
 .../tensorrt/plugin/split_op_plugin.cu        |   7 +-
 .../tensorrt/plugin/split_op_plugin.h         |  27 ++--
 paddle/fluid/operators/math/pooling.cu        |  36 +++++
 paddle/fluid/operators/math/pooling.h         |  12 ++
 10 files changed, 339 insertions(+), 81 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
 create mode 100644 paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h

diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index ed4c398cee..396ba510c8 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -18,7 +18,7 @@ nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
 nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
 nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op tensorrt_plugin SERIAL)
 nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
 nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc

diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index 4885002084..db8e7f8438 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -13,25 +13,57 @@ See the License for the specific language governing permissions and
limitations under the
License. */

 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {

+void DealCeilMode(const nvinfer1::Dims &input_shape, std::vector<int> ksize,
+                  std::vector<int> strides, std::vector<int> paddings,
+                  nvinfer1::DimsHW *pre_pad, nvinfer1::DimsHW *post_pad,
+                  int input_dims) {
+  int input_height = input_shape.d[input_dims - 2];
+  int input_width = input_shape.d[input_dims - 1];
+  int floor_h_output_size =
+      (input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+  int ceil_h_output_size =
+      (input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) /
+          strides[0] +
+      1;
+
+  int floor_w_output_size =
+      (input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+  int ceil_w_output_size =
+      (input_width - ksize[1] + 2 * paddings[1] + strides[1] - 1) / strides[1] +
+      1;
+  if (floor_h_output_size != ceil_h_output_size) {
+    post_pad->h() = strides[0] - 1;
+  }
+
+  if (floor_w_output_size != ceil_w_output_size) {
+    post_pad->w() = strides[1] - 1;
+  }
+}
+
 /*
  * Pool2dOp, IPoolingLayer in TRT. This Layer doesn't have weights.
  */
 class Pool2dOpConverter : public OpConverter {
  public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(3)
+  void operator()(const framework::proto::OpDesc &op,
+                  const framework::Scope &scope, bool test_mode) override {
+    VLOG(40)
         << "convert a fluid pool2d op to tensorrt pool2d layer without bias";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
     PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    nvinfer1::Dims input_shape = input1->getDimensions();
+    int input_dims = input_shape.nbDims;
+
+    PADDLE_ENFORCE_EQ(input_dims, 3UL);

     bool global_pooling = boost::get<bool>(op_desc.GetAttr("global_pooling"));
     std::string pool_type =
         boost::get<std::string>(op_desc.GetAttr("pooling_type"));
     std::vector<int> ksize =
         boost::get<std::vector<int>>(op_desc.GetAttr("ksize"));
     std::vector<int> strides =
         boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
     std::vector<int> paddings =
         boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
     bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));

-    nvinfer1::Dims input_shape = input1->getDimensions();
-    int nbDims = input_shape.nbDims;
-    nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
-    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
-    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
-
-    if (global_pooling == true) {
-      nv_ksize.d[0] = input_shape.d[nbDims - 2];
-      nv_ksize.d[1] = input_shape.d[nbDims - 1];
-      nv_strides.h() = 1;
-      nv_strides.w() = 1;
-      nv_paddings.h() = 0;
-      nv_paddings.w() = 0;
-    }
-
-    PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL);
-
     nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX;
     if (pool_type == "max") {
       nv_pool_type = nvinfer1::PoolingType::kMAX;
     } else if (pool_type == "avg") {
       nv_pool_type = nvinfer1::PoolingType::kAVERAGE;
     } else {
       PADDLE_THROW("TensorRT unsupported pooling type!");
     }

-    if (ceil_mode) {
-      nvinfer1::DimsHW pre_pad(0, 0);
-      nvinfer1::DimsHW post_pad(0, 0);
-      int input_height = input_shape.d[nbDims - 2];
-      int input_width = input_shape.d[nbDims - 1];
-      int floor_h_output_size =
-          (input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-      int ceil_h_output_size =
-          (input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) /
-              strides[0] +
-          1;
-
-      int floor_w_output_size =
-          (input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
-      int ceil_w_output_size =
-          (input_width - ksize[1] + 2 *
paddings[1] + strides[1] - 1) / - strides[1] + - 1; - if (floor_h_output_size != ceil_h_output_size) { - post_pad.h() = strides[0] - 1; + nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); + nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + + nvinfer1::ILayer *layer = nullptr; + + if (global_pooling == true) { + nv_ksize.d[0] = input_shape.d[input_dims - 2]; + nv_ksize.d[1] = input_shape.d[input_dims - 1]; + auto *layer = TRT_ENGINE_ADD_LAYER( + engine_, Pooling, *const_cast(input1), + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created."); + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool2d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode || + output_name == "patch_6_pool1.avg_pool.output.1.tmp_0702") { + engine_->DeclareOutput(output_name); } + return; + } - if (floor_w_output_size != ceil_w_output_size) { - post_pad.w() = strides[1] - 1; + if (pool_type == "max") { + nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]); + nvinfer1::DimsHW post_pad(paddings[0], paddings[1]); + if (ceil_mode) { + // If ceil mode is true, we will pad the appropriate size to the input. + DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, + input_dims); + auto *pad_layer = TRT_ENGINE_ADD_LAYER( + engine_, Padding, *const_cast(input1), pre_pad, + post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, "pad layer in poolOp converter could not be created."); + input1 = pad_layer->getOutput(0); + } + auto *pool_layer = TRT_ENGINE_ADD_LAYER( + engine_, Pooling, *const_cast(input1), + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL(pool_layer, "pool layer could not be created."); + pool_layer->setStride(nv_strides); + pool_layer->setPadding(nv_paddings); + layer = pool_layer; + } else { + // Average pooling needs to exclude the padding pixels from the average + // mean. + // It is not supported well by TRT, we use a plugin here. 
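+      // The plugin invokes Paddle's own Pool2dDirectCUDAFunctor with
+      // exclusive=true (see avg_pool_op_plugin.cu below), so padded zeros are
+      // left out of the averaging denominator; plain IPoolingLayer averages
+      // over the whole window and cannot express this.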
+ std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); } - auto* layer = TRT_ENGINE_ADD_LAYER( - engine_, Padding, *const_cast(input1), pre_pad, - post_pad); - input1 = layer->getOutput(0); + AvgPoolPlugin *plugin = + new AvgPoolPlugin(ceil_mode, ksize, strides, paddings, input_shape_v); + auto *avg_pool_layer = engine_->AddPlugin(&input1, 1, plugin); + layer = avg_pool_layer; } - auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, - *const_cast(input1), - nv_pool_type, nv_ksize); - PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created."); - layer->setStride(nv_strides); - layer->setPadding(nv_paddings); auto output_name = op_desc.Output("Out")[0]; layer->setName(("pool2d (Output: " + output_name + ")").c_str()); layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { + if (test_mode || + output_name == "patch_6_pool1.avg_pool.output.1.tmp_0702") { engine_->DeclareOutput(output_name); } } diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index ee597f8465..bded833505 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -20,20 +20,21 @@ namespace paddle { namespace inference { namespace tensorrt { -void test_pool2d(bool global_pooling, bool ceil_mode) { +void test_pool2d(bool global_pooling, bool ceil_mode, + std::string pool_type = "max") { framework::Scope scope; std::unordered_set parameters; TRTConvertValidation validator(5, parameters, scope, 1 << 15); // The ITensor's Dims should not contain the batch size. // So, the ITensor's Dims of input and output should be C * H * W. 
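+  // With a 6 x 7 input, ksize 2 and stride 2 (declared below, no padding),
+  // the expected outputs work out to floor((6 - 2) / 2) + 1 = 3 by
+  // floor((7 - 2) / 2) + 1 = 3 without ceil_mode, and
+  // ceil((6 - 2) / 2) + 1 = 3 by ceil((7 - 2) / 2) + 1 = 4 with ceil_mode.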
- validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 13, 14)); + validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 6, 7)); if (global_pooling) validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1)); else if (ceil_mode) - validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 7)); + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 3, 4)); else - validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 6)); + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 3, 3)); // Prepare Op description framework::OpDesc desc; @@ -41,10 +42,10 @@ void test_pool2d(bool global_pooling, bool ceil_mode) { desc.SetInput("X", {"pool2d-X"}); desc.SetOutput("Out", {"pool2d-Out"}); - std::vector ksize({3, 3}); + std::vector ksize({2, 2}); std::vector strides({2, 2}); std::vector paddings({0, 0}); - std::string pooling_t = "max"; + std::string pooling_t = pool_type; desc.SetAttr("pooling_type", pooling_t); desc.SetAttr("ksize", ksize); @@ -63,7 +64,8 @@ void test_pool2d(bool global_pooling, bool ceil_mode) { TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); } TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); } -TEST(Pool2dOpConverter, test_ceil_mode) { test_pool2d(false, true); } +TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); } +TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 71b7a55161..c246f8341f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1 +1,2 @@ -nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu DEPS enforce) +nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu +avg_pool_op_plugin.cu DEPS enforce pooling) diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu new file mode 100644 index 0000000000..e440f1c313 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu @@ -0,0 +1,62 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
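+
+// Forward-pass half of the AvgPoolPlugin declared in avg_pool_op_plugin.h:
+// getOutputDimensions() reports the shape precomputed in the constructor,
+// while enqueue() prepends the batch size to the cached shapes and hands the
+// work to Paddle's CUDA pooling functor on the TensorRT stream.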
+ +#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +nvinfer1::Dims AvgPoolPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* inputDims, int nbInputs) { + assert(nbInputs == 1); + assert(index == 0); + assert(inputDims[0].nbDims == 3); + nvinfer1::Dims const& input_dims = inputDims[0]; + + nvinfer1::Dims output_dims = input_dims; + + output_dims.d[1] = output_shape_[1]; + output_dims.d[2] = output_shape_[2]; + return output_dims; +} + +int AvgPoolPlugin::enqueue(int batchSize, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + auto const& input_dims = this->getInputDims(0); + int input_size = 0; + float const* idata = reinterpret_cast(inputs[0]); + float** odatas = reinterpret_cast(outputs); + + paddle::operators::math::AvgPool pool_process; + paddle::operators::math::Pool2dDirectCUDAFunctor< + paddle::operators::math::AvgPool, float> + pool2d_forward; + + std::vector input_shape = input_shape_; + std::vector output_shape = output_shape_; + input_shape.insert(input_shape.begin(), batchSize); + output_shape.insert(output_shape.begin(), batchSize); + + pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, + pool_process, true, odatas[0], stream); + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h new file mode 100644 index 0000000000..e83fd38858 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h @@ -0,0 +1,109 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class AvgPoolPlugin : public PluginTensorRT { + private: + bool ceil_mode_; + std::vector ksize_; + std::vector strides_; + std::vector paddings_; + std::vector input_shape_; + std::vector output_shape_; + + protected: + size_t getSerializationSize() override { + return SerializedSize(ceil_mode_) + SerializedSize(ksize_) + + SerializedSize(strides_) + SerializedSize(paddings_) + + SerializedSize(input_shape_) + getBaseSerializationSize(); + } + + // TRT will call this func when we need to serialize the configuration of + // tensorrt. + // It should not be called by users. 
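+  // Every field written here must stay in step with getSerializationSize()
+  // above and with the deserializing constructor further down; output_shape_
+  // is not written, and note that the deserializing constructor as written
+  // restores input_shape_ without rebuilding output_shape_.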
+ void serialize(void *buffer) override { + serializeBase(buffer); + SerializeValue(&buffer, ceil_mode_); + SerializeValue(&buffer, ksize_); + SerializeValue(&buffer, strides_); + SerializeValue(&buffer, paddings_); + SerializeValue(&buffer, input_shape_); + } + + public: + AvgPoolPlugin(bool ceil_mode, std::vector ksize, + std::vector strides, std::vector paddings, + std::vector input_shape) + : ceil_mode_(ceil_mode), + ksize_(ksize), + strides_(strides), + paddings_(paddings), + input_shape_(input_shape) { + int output_h, output_w; + output_shape_ = input_shape_; + if (!ceil_mode_) { + output_h = + (input_shape[1] - ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1; + output_w = + (input_shape[2] - ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1; + } else { + output_h = + (input_shape[1] - ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) / + strides_[0] + + 1; + output_w = + (input_shape[2] - ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) / + strides_[1] + + 1; + } + output_shape_[1] = output_h; + output_shape_[2] = output_w; + } + + // It was used for tensorrt deserialization. + // It should not be called by users. + AvgPoolPlugin(void const *serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &ceil_mode_); + DeserializeValue(&serialData, &serialLength, &ksize_); + DeserializeValue(&serialData, &serialLength, &strides_); + DeserializeValue(&serialData, &serialLength, &paddings_); + DeserializeValue(&serialData, &serialLength, &input_shape_); + } + + AvgPoolPlugin *clone() const override { + return new AvgPoolPlugin(ceil_mode_, ksize_, strides_, paddings_, + input_shape_); + } + + const char *getPluginType() const override { return "avg_pool"; } + int getNbOutputs() const override { return 1; } + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, + int nbInputDims) override; + int initialize() override { return 0; } + int enqueue(int batchSize, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream) override; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index bd6a44dcc1..14c286ff90 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" @@ -76,6 +75,6 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, return cudaGetLastError() != cudaSuccess; } -} // tensorrt -} // inference -} // paddle +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 7281e40c33..1b4ac34d31 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -13,7 +13,7 @@ // limitations under the License. 
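// cpplint fixes below: 'virtual' is dropped from member functions that are
// already marked 'override', and the closing-namespace braces gain the
// standard '}  // namespace foo' comments.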
#pragma once - +#include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { @@ -27,7 +27,7 @@ class SplitPlugin : public PluginTensorRT { std::vector segment_offsets_; protected: - virtual size_t getSerializationSize() override { + size_t getSerializationSize() override { return SerializedSize(axis_) + SerializedSize(output_length_) + getBaseSerializationSize(); } @@ -35,7 +35,7 @@ class SplitPlugin : public PluginTensorRT { // TRT will call this func when we need to serialize the configuration of // tensorrt. // It should not be called by users. - virtual void serialize(void *buffer) override { + void serialize(void *buffer) override { serializeBase(buffer); SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); @@ -59,16 +59,15 @@ class SplitPlugin : public PluginTensorRT { return new SplitPlugin(axis_, output_length_); } - virtual const char *getPluginType() const override { return "split"; } - virtual int getNbOutputs() const override { return output_length_.size(); } - virtual nvinfer1::Dims getOutputDimensions(int index, - const nvinfer1::Dims *inputs, - int nbInputDims) override; - virtual int initialize() override; - virtual int enqueue(int batchSize, const void *const *inputs, void **outputs, - void *workspace, cudaStream_t stream) override; + const char *getPluginType() const override { return "split"; } + int getNbOutputs() const override { return output_length_.size(); } + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, + int nbInputDims) override; + int initialize() override; + int enqueue(int batchSize, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream) override; }; -} // tensorrt -} // inference -} // paddle +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index a689eb4224..cdc79e207a 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -153,6 +153,37 @@ __global__ void KernelMaxPool2DGrad( } } +template +void Pool2dDirectCUDAFunctor::operator()( + const T* input, const std::vector& input_shape, + const std::vector& output_shape, const std::vector& ksize, + const std::vector& strides, const std::vector& paddings, + PoolProcess pool_compute, bool exclusive, T* output, cudaStream_t stream) { + const int batch_size = input_shape[0]; + const int input_channels = input_shape[1]; + const int input_height = input_shape[2]; + const int input_width = input_shape[3]; + const int output_channels = output_shape[1]; + const int output_height = output_shape[2]; + const int output_width = output_shape[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool2D<<>>( + nthreads, input, input_channels, input_height, input_width, output_height, + output_width, ksize_height, ksize_width, stride_height, stride_width, + padding_height, padding_width, pool_compute, exclusive, output); +} + /* * All tensors are in NCHW format. * Ksize, strides, paddings are two elements. 
These two elements represent @@ -291,6 +322,11 @@ class MaxPool2dGradFunctor { } }; +template class Pool2dDirectCUDAFunctor, + float>; +template class Pool2dDirectCUDAFunctor, + float>; + template class MaxPool2dGradFunctor; template class MaxPool2dGradFunctor; diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 0f64e321bf..fa732f96d4 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -82,6 +82,18 @@ class AvgPoolGrad { * This is different from average pooling. So we rewrite the max_pool_grad: * MaxPool2dGradFunctor, MaxPool3dGradFunctor. */ + +template +class Pool2dDirectCUDAFunctor { + public: + void operator()(const T* input, const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_compute, + bool exclusive, T* output, cudaStream_t stream); +}; + template class Pool2dFunctor { public: From 8f9a8c455a2ec22f5f67cc464d5b6a82cafbfb57 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 16 Nov 2018 08:14:04 +0000 Subject: [PATCH 03/80] delete unused test code. test=develop --- paddle/fluid/inference/tensorrt/convert/pool2d_op.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index db8e7f8438..2cfd0f6905 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -102,8 +102,7 @@ class Pool2dOpConverter : public OpConverter { layer->setName(("pool2d (Output: " + output_name + ")").c_str()); layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode || - output_name == "patch_6_pool1.avg_pool.output.1.tmp_0702") { + if (test_mode) { engine_->DeclareOutput(output_name); } return; @@ -148,8 +147,7 @@ class Pool2dOpConverter : public OpConverter { layer->setName(("pool2d (Output: " + output_name + ")").c_str()); layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode || - output_name == "patch_6_pool1.avg_pool.output.1.tmp_0702") { + if (test_mode) { engine_->DeclareOutput(output_name); } } From 9b64aac41ffa89cd742c9a926591a4607b3c15ed Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 16 Nov 2018 09:54:36 +0000 Subject: [PATCH 04/80] add macro for pool2dDirectCUDAFunctor test=develop --- paddle/fluid/operators/math/pooling.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index fa732f96d4..923babd4c2 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -82,7 +82,7 @@ class AvgPoolGrad { * This is different from average pooling. So we rewrite the max_pool_grad: * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
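 * The direct CUDA functor declared next takes a raw cudaStream_t, so its
 * declaration has to sit behind PADDLE_WITH_CUDA: this header is also
 * compiled into CPU-only builds, where that type does not exist.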
 */
-
+#ifdef PADDLE_WITH_CUDA
 template <typename PoolProcess, typename T>
 class Pool2dDirectCUDAFunctor {
  public:
   void operator()(const T* input, const std::vector<int>& input_shape,
                   const std::vector<int>& output_shape,
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_compute,
                   bool exclusive, T* output, cudaStream_t stream);
 };
+#endif

 template <typename PoolProcess, typename T>
 class Pool2dFunctor {
  public:

From 513bb6c1513dde0e3b9e2b9da5acccd9649cda0d Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Thu, 8 Nov 2018 17:16:16 +0100
Subject: [PATCH 05/80] Squashing MKL based softmax for inference

test=develop

- Added profiling to softmax functors
- MKL based softmax inference op
- Fix to softmax computation via MKL
- cleaning
- Cosmetic fixes to softmax MKL
- Fix to ON_INFER lack of propagation
---
 CMakeLists.txt                             | 15 +++---
 paddle/fluid/operators/math/softmax_impl.h | 59 ++++++++++++----------
 paddle/fluid/operators/softmax_op.h        |  2 +-
 3 files changed, 42 insertions(+), 34 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9cfec8e70b..c62cc9bfd7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -302,6 +302,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")

+if (ON_INFER)
+  message(STATUS "On inference mode, will take place some specific optimization.")
+  add_definitions(-DPADDLE_ON_INFERENCE)
+else()
+  #TODO(luotao), combine this warning with `make inference_lib_dist` command.
+  message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
+endif()
+
 add_subdirectory(paddle)
 if(WITH_PYTHON)
   add_subdirectory(python)
@@ -312,10 +320,3 @@ if(WITH_DOC)
   find_python_module(recommonmark REQUIRED)
   add_subdirectory(doc)
 endif()
-
-if (ON_INFER)
-  message(STATUS "On inference mode, will take place some specific optimization.")
-else()
-  #TODO(luotao), combine this warning with `make inference_lib_dist` command.
-  message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
-endif()

diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 7cf98f2725..e09a243347 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -16,6 +16,7 @@ limitations under the
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { namespace math { @@ -65,36 +66,42 @@ void SoftmaxFunctor::operator()( .broadcast(one_by_class)); } -template -class SoftmaxFunctor { +template +class SoftmaxFunctor { void operator()(const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { - auto logits = EigenMatrix::From(*X); - auto softmax = EigenMatrix::From(*Y); - + auto in_dims = X->dims(); + auto out_dims = Y->dims(); + const float* in_data = X->data(); + float* out_data = Y->data(); const int kBatchDim = 0; const int kClassDim = 1; - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - - auto shifted_logits = (logits - - logits.maximum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); - - softmax.device(*context.eigen_device()) = shifted_logits.exp(); - softmax.device(*context.eigen_device()) = (softmax * - softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + // 2D data. Batch x C + const int batch_size = in_dims[kBatchDim]; + const int num_classes = in_dims[kClassDim]; + std::vector entities(batch_size); + auto blas = math::GetBlas(context); + for (int n = 0; n < batch_size; ++n) { + entities[n] = in_data[n * num_classes]; + for (int c = 1; c < num_classes; ++c) { + entities[n] = in_data[n * num_classes + c] > entities[n] + ? in_data[n * num_classes + c] + : entities[n]; + } + for (int c = 0; c < num_classes; ++c) { + out_data[n * num_classes + c] = + in_data[n * num_classes + c] - entities[n]; + } + } + + blas.VEXP(num_classes * batch_size, out_data, out_data); + for (int n = 0; n < batch_size; ++n) { + entities[n] = out_data[n * num_classes]; + for (int c = 1; c < num_classes; ++c) { + entities[n] += out_data[n * num_classes + c]; + } + blas.SCAL(num_classes, 1.0f / entities[n], &out_data[n * num_classes]); + } } }; diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 2fea8a65bc..91829d5761 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -35,7 +35,7 @@ class SoftmaxKernel : public framework::OpKernel { Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); -#ifdef ON_INFER +#ifdef PADDLE_ON_INFERENCE math::SoftmaxFunctor()( context.template device_context(), &X_2d, &Out_2d); #else From 5d0ba9da74bfa831908b1332839f5c26871a027b Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 16 Nov 2018 20:19:35 +0800 Subject: [PATCH 06/80] Add python3.6 python3.7 support to manylinux Dockerfile test=develop --- tools/manylinux1/Dockerfile.x64 | 6 +++++- tools/manylinux1/build_scripts/build.sh | 8 +++++--- tools/manylinux1/build_scripts/build_utils.sh | 14 ++++++++++++-- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index 0d59e4c110..4468220a4d 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -41,12 +41,16 @@ RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddl RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r 
/root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install -r /root/requirements.txt && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \ go get github.com/Masterminds/glide && \ rm -rf /root/requirements.txt RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python + LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh index eb4b477dcb..c0f01601c8 100644 --- a/tools/manylinux1/build_scripts/build.sh +++ b/tools/manylinux1/build_scripts/build.sh @@ -9,7 +9,7 @@ set -ex # remove others to expedite build and reduce docker image size. The original # manylinux docker image project builds many python versions. 
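# NOTE CPython 3.6/3.7 need libffi headers at configure time, hence the
# libffi entry added to PYTHON_COMPILE_DEPS below; on the CentOS-based
# manylinux image the devel package is normally named libffi-devel rather
# than libffi-dev.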
# NOTE We added back 3.5.1, since auditwheel requires python 3.3+ -CPYTHON_VERSIONS="2.7.11 3.5.1" +CPYTHON_VERSIONS="3.7.0 3.6.0 3.5.1 2.7.11" # openssl version to build, with expected sha256 hash of .tar.gz # archive @@ -25,7 +25,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 # Dependencies for compiling Python that we want to remove from # the final image after compiling Python -PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel" +PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-dev" # Libraries that are allowed as part of the manylinux1 profile MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel" @@ -77,11 +77,13 @@ mkdir -p /opt/python build_cpythons $CPYTHON_VERSIONS PY35_BIN=/opt/python/cp35-cp35m/bin +PY36_BIN=/opt/python/cp36-cp36m/bin +PY37_BIN=/opt/python/cp37-cp37m/bin # NOTE Since our custom manylinux image builds pythons with shared # libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running # python. ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" -LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib:$(dirname ${PY36_BIN})/lib:$(dirname ${PY37_BIN})/lib" # Our openssl doesn't know how to find the system CA trust store # (https://github.com/pypa/manylinux/issues/53) diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh index 10422ae3bd..942ca2b0f1 100755 --- a/tools/manylinux1/build_scripts/build_utils.sh +++ b/tools/manylinux1/build_scripts/build_utils.sh @@ -53,8 +53,12 @@ function do_cpython_build { # NOTE --enable-shared for generating libpython shared library needed for # linking of some of the nupic.core test executables. 
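# NOTE 'make altinstall' (used below for 3.7+) only installs versioned
# binaries such as python3.7 and never touches the unversioned names; the
# symlink fix-ups later in this function therefore link python3.6/python3.7
# to 'python' inside each build prefix.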
CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null - make -j2 > /dev/null - make install > /dev/null + make -j8 > /dev/null + if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then + make altinstall > /dev/null + else + make install > /dev/null + fi popd echo "ZZZ looking for libpython" find / -name 'libpython*.so*' @@ -64,6 +68,12 @@ function do_cpython_build { if [ -e ${prefix}/bin/python3 ]; then ln -s python3 ${prefix}/bin/python fi + if [ -e ${prefix}/bin/python3.6 ]; then + ln -s python3.6 ${prefix}/bin/python + fi + if [ -e ${prefix}/bin/python3.7 ]; then + ln -s python3.7 ${prefix}/bin/python + fi # NOTE Make libpython shared library visible to python calls below LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel From 213ec37d6ad84c3774f1a5e203566dc47a1b63da Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Thu, 25 Oct 2018 16:18:04 +0200 Subject: [PATCH 07/80] MKLDNN elementwise_add: simple initial implementation of the operator for MKLDNN format --- .../operators/elementwise_mul_mkldnn_op.cc | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 paddle/fluid/operators/elementwise_mul_mkldnn_op.cc diff --git a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc new file mode 100644 index 0000000000..22289ab417 --- /dev/null +++ b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_op.h" +#include "paddle/fluid/operators/elementwise_op_function.h" + +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using framework::DataLayout; + +template +class ElementwiseMulMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + int axis = ctx.Attr("axis"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + const T* x_data = x->data(); + const T* y_data = y->data(); + T* z_data = z->mutable_data(ctx.GetPlace()); + + auto x_dims = x->dims(); + auto y_dims_untrimmed = y->dims(); + + if (x_dims != y_dims_untrimmed) { + int pre, n, post; + get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post); + + if (post == 1) { + PADDLE_THROW("Not implemented when post is 1"); + } else { + // Just check whether it works for RE-Resnext. 
+ + PADDLE_ENFORCE_EQ(x_dims.size(), 4, "X should have 4 dimensions"); + + int n = x_dims[0]; + int c = x_dims[1]; + int h = x_dims[2]; + int w = x_dims[3]; + + PADDLE_ENFORCE(y_dims_untrimmed[0] == n && y_dims_untrimmed[1] == c, + "Y should be in nc format"); + + constexpr int simd_width = 16; + int C = c / simd_width; + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < C; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + auto ptr_x = x_data + ni * C * h * w * simd_width + + ci * h * w * simd_width + hi * w * simd_width + + wi * simd_width; + auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; + + auto ptr_z = z_data + ni * C * h * w * simd_width + + ci * h * w * simd_width + hi * w * simd_width + + wi * simd_width; + + for (int i = 0; i < simd_width; i++) { + ptr_z[i] = ptr_x[i] * ptr_y[i]; + } + } + } + } + } + } + + z->set_layout(DataLayout::kMKLDNN); + z->set_format(x->format()); + } else { + PADDLE_THROW("Not implemented when dims are equal"); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_mul, MKLDNN, ::paddle::platform::CPUPlace, + ops::ElementwiseMulMKLDNNKernel) From 2d73ad180ae80d1da4ae319106a22f8a11c79da9 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Thu, 25 Oct 2018 17:07:17 +0200 Subject: [PATCH 08/80] MKLDNN elementwise_mul: simple xbyak version for AVX512 --- .../operators/elementwise_mul_mkldnn_op.cc | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc index 22289ab417..595a6232da 100644 --- a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc @@ -17,11 +17,29 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" +#include "xbyak/xbyak.h" +#include "xbyak/xbyak_util.h" + namespace paddle { namespace operators { using framework::DataLayout; +struct vector_mul : public Xbyak::CodeGenerator { + vector_mul() { + // RDI is ptr X + // RSI is ptr Y + // RDX is ptr Z + + vmovups(zmm2, ptr[rdi]); + vmovups(zmm3, ptr[rsi]); + vmulps(zmm1, zmm2, zmm3); + vmovups(ptr[rdx], zmm1); + + ret(); + } +}; + template class ElementwiseMulMKLDNNKernel : public framework::OpKernel { public: @@ -61,6 +79,14 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; + vector_mul mul; + + using mul_func_t = void (*)(const float*, const float*, float*); + + mul_func_t mul_func = (mul_func_t)mul.getCode(); + + auto ptr_x = x_data; + for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { for (int hi = 0; hi < h; hi++) { @@ -74,9 +100,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { ci * h * w * simd_width + hi * w * simd_width + wi * simd_width; - for (int i = 0; i < simd_width; i++) { - ptr_z[i] = ptr_x[i] * ptr_y[i]; - } + mul_func(ptr_x, ptr_y, ptr_z); } } } From ad09facafecfd7157ea18d3b433c15135d914978 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Fri, 26 Oct 2018 14:01:44 +0200 Subject: [PATCH 09/80] MKLDNN elementwise_mul: CPU tests initially refactored. 
MKLDNN mul test for broadcast added --- .../operators/elementwise_mul_mkldnn_op.cc | 2 - .../unittests/test_elementwise_add_op.py | 6 --- .../test_elementwise_mul_mkldnn_op.py | 50 +++++++++++++++++++ .../unittests/test_elementwise_mul_op.py | 44 +++++++++++----- 4 files changed, 81 insertions(+), 21 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py diff --git a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc index 595a6232da..13e4cc04df 100644 --- a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc @@ -85,8 +85,6 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { mul_func_t mul_func = (mul_func_t)mul.getCode(); - auto ptr_x = x_data; - for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { for (int hi = 0; hi < h; hi++) { diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index 5aec5d8e38..d71a9c0151 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -43,19 +43,13 @@ class TestElementwiseAddOp(OpTest): self.check_output() def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005) def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return self.check_grad( ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X")) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return self.check_grad( ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py new file mode 100644 index 0000000000..a0581d16de --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py @@ -0,0 +1,50 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
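+# The broadcast test below emulates MKL-DNN's nChw16c blocked layout in
+# plain numpy: with exactly 16 channels there is a single 16-wide channel
+# block, so transposing NCHW data to NHWC and reinterpreting the buffer as
+# (1, 16, 2, 2) yields the memory order the MKL-DNN kernel expects.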
+ +from __future__ import print_function +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from test_elementwise_mul_op import * + + +class ElementwiseMulMKLDNNOp(ElementwiseMulOp): + def init_input_output(self): + x = np.random.rand(1, 16, 2, 2).astype(self.dtype) + self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2) + self.y = np.random.rand(1, 16).astype(self.dtype) + + self.out = x * self.y.reshape(1, 16, 1, 1) + self.out = self.out.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2) + + def init_kernel_type(self): + self.use_mkldnn = True + + def init_axis(self): + self.axis = 0 + + def test_check_grad_normal(self): + pass + + def test_check_grad_ingore_x(self): + pass + + def test_check_grad_ingore_y(self): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 53409e436c..57ba34f833 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -21,13 +21,24 @@ from paddle.fluid.op import Operator class ElementwiseMulOp(OpTest): + def init_kernel_type(self): + self.use_mkldnn = False + def setUp(self): self.op_type = "elementwise_mul" + self.dtype = np.float32 + self.axis = -1 + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + self.inputs = { - 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float64"), - 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float64") + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) } - self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} + self.outputs = {'Out': self.out} + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} def test_check_output(self): self.check_output() @@ -41,6 +52,17 @@ class ElementwiseMulOp(OpTest): def test_check_grad_ingore_y(self): self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.multiply(self.x, self.y) + + def init_dtype(self): + pass + + def init_axis(self): + pass + class TestElementwiseMulOp_scalar(ElementwiseMulOp): def setUp(self): @@ -63,17 +85,13 @@ class TestElementwiseMulOp_Vector(ElementwiseMulOp): class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp): - def setUp(self): - self.op_type = "elementwise_mul" - self.inputs = { - 'X': np.random.rand(2, 3, 4).astype(np.float64), - 'Y': np.random.rand(2).astype(np.float64) - } + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(2).astype(self.dtype) + self.out = self.x * self.y.reshape(2, 1, 1) - self.attrs = {'axis': 0} - self.outputs = { - 'Out': self.inputs['X'] * self.inputs['Y'].reshape(2, 1, 1) - } + def init_axis(self): + self.axis = 0 class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp): From 700bcbf74fa5c7b43fa183063e9bbdfc2bd23265 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Sun, 28 Oct 2018 02:00:34 +0100 Subject: [PATCH 10/80] MKLDNN elementwise_mul: h and w loops implemented in xbyak --- .../operators/elementwise_mul_mkldnn_op.cc | 58 +++++++++++++------ 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc 
b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc index 13e4cc04df..21716e271d 100644 --- a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc @@ -30,16 +30,42 @@ struct vector_mul : public Xbyak::CodeGenerator { // RDI is ptr X // RSI is ptr Y // RDX is ptr Z + // RCX is h + // r8 is w - vmovups(zmm2, ptr[rdi]); + push(rbx); + + xor_(rax, rax); + xor_(r10, r10); vmovups(zmm3, ptr[rsi]); - vmulps(zmm1, zmm2, zmm3); - vmovups(ptr[rdx], zmm1); + L("h_loop"); + xor_(rbx, rbx); + L("w_loop"); + vmovups(zmm2, ptr[rdi + rax]); + vmulps(zmm1, zmm2, zmm3); + vmovups(ptr[rdx + rax], zmm1); + add(rax, 64); + inc(rbx); + cmp(r8, rbx); + jnz("w_loop"); + inc(r10); + cmp(r10, rcx); + jnz("h_loop"); + + pop(rbx); ret(); } }; +void check(const float* x, const float* y, float* z, int w) { + for (int wi = 0; wi < w; wi++) { + for (int i = 0; i < 16; i++) { + z[wi * 16 + i] = x[wi * 16 + i] * y[i]; + } + } +} + template class ElementwiseMulMKLDNNKernel : public framework::OpKernel { public: @@ -65,7 +91,6 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { PADDLE_THROW("Not implemented when post is 1"); } else { // Just check whether it works for RE-Resnext. - PADDLE_ENFORCE_EQ(x_dims.size(), 4, "X should have 4 dimensions"); int n = x_dims[0]; @@ -81,26 +106,21 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { vector_mul mul; - using mul_func_t = void (*)(const float*, const float*, float*); + using mul_func_t = + void (*)(const float*, const float*, float*, int, int); mul_func_t mul_func = (mul_func_t)mul.getCode(); for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { - for (int hi = 0; hi < h; hi++) { - for (int wi = 0; wi < w; wi++) { - auto ptr_x = x_data + ni * C * h * w * simd_width + - ci * h * w * simd_width + hi * w * simd_width + - wi * simd_width; - auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; - - auto ptr_z = z_data + ni * C * h * w * simd_width + - ci * h * w * simd_width + hi * w * simd_width + - wi * simd_width; - - mul_func(ptr_x, ptr_y, ptr_z); - } - } + auto ptr_x = + x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + + auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; + auto ptr_z = + z_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + + mul_func(ptr_x, ptr_y, ptr_z, h, w); } } } From 4e54ab76ecb7e86dcfbfd59824bc2c5593513809 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 6 Nov 2018 10:57:15 +0100 Subject: [PATCH 11/80] Add HasAttr method to Operator --- paddle/fluid/framework/operator.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 40b0130b26..6918e030bf 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -100,6 +100,7 @@ class OperatorBase { const std::string& Type() const { return type_; } + bool HasAttr(const std::string& name) const { return attrs_.count(name); } template inline const T& Attr(const std::string& name) const { PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", From ed31936ba1343a84460d2fd1883f75e0951ce353 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 6 Nov 2018 11:04:39 +0100 Subject: [PATCH 12/80] MKLDNN elementwise_mul: Support NCHW, update UT --- .../operators/elementwise/elementwise_op.h | 14 ++ .../operators/elementwise_mul_mkldnn_op.cc | 124 +++++++++++++----- .../test_elementwise_mul_mkldnn_op.py | 29 +++- 3 files changed, 135 insertions(+), 32 
deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index f01f67692e..16d919689c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -97,6 +97,20 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { .EqualGreaterThan(-1); AddAttr("use_mkldnn", "(bool, default false). Used by MKLDNN.") .SetDefault(false); + AddAttr( + "x_data_format", + "(string, default NCHW) Only used in mkldnn" + "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". " + "Defaults to \"\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault(""); + AddAttr( + "y_data_format", + "(string, default \"\") Only used in mkldnn" + "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". " + "Defaults to \"\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault(""); AddComment(string::Sprintf(R"DOC( Elementwise %s Operator diff --git a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc index 21716e271d..d66c58bd45 100644 --- a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" @@ -24,6 +25,7 @@ namespace paddle { namespace operators { using framework::DataLayout; +using mkldnn::memory; struct vector_mul : public Xbyak::CodeGenerator { vector_mul() { @@ -66,6 +68,33 @@ void check(const float* x, const float* y, float* z, int w) { } } +static mkldnn::memory::format StringToMKLDNNFormat(std::string& format) { + std::transform(format.begin(), format.end(), format.begin(), ::tolower); + + if(!format.compare("nchw")) { + return memory::format::nchw; + } else if(!format.compare("nchw16c")) { + return memory::format::nChw16c; + } else if(!format.compare("nchw8c")) { + return memory::format::nChw8c; + } else if(!format.compare("nhwc")) { + return memory::format::nhwc; + } else { + return memory::format::any; + } +} + +static void UpdateDataFormat(const framework::ExecutionContext& ctx, + framework::Tensor* tensor, const char* attribute) { + if(ctx.op().HasAttr(attribute)) { + auto format_as_string = ctx.Attr(attribute); + auto format = StringToMKLDNNFormat(format_as_string); + if (format != memory::format::any) { + tensor->set_format(format); + } + } +} + template class ElementwiseMulMKLDNNKernel : public framework::OpKernel { public: @@ -83,52 +112,87 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { auto x_dims = x->dims(); auto y_dims_untrimmed = y->dims(); - if (x_dims != y_dims_untrimmed) { - int pre, n, post; - get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post); + UpdateDataFormat(ctx, (Tensor*)x, "x_data_format"); + UpdateDataFormat(ctx, (Tensor*)y, "y_data_format"); - if (post == 1) { - PADDLE_THROW("Not implemented when post is 1"); - } else { - // Just check whether it works for RE-Resnext. 
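+        // This xbyak fast path is only taken when x is nChw16c and y is nc
+        // (the format check above); any other layout combination falls
+        // through to the generic TransformFunctor fallback at the end of
+        // Compute().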
- PADDLE_ENFORCE_EQ(x_dims.size(), 4, "X should have 4 dimensions"); + if (x->format() == memory::format::nChw16c && y->format() == memory::format::nc) { + if (x_dims != y_dims_untrimmed) { + int pre, n, post; + get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post); + + if (post == 1) { + PADDLE_THROW("Not implemented when post is 1"); + } else { + // Just check whether it works for RE-Resnext. + PADDLE_ENFORCE_EQ(x_dims.size(), 4, "X should have 4 dimensions"); - int n = x_dims[0]; - int c = x_dims[1]; - int h = x_dims[2]; - int w = x_dims[3]; + int n = x_dims[0]; + int c = x_dims[1]; + int h = x_dims[2]; + int w = x_dims[3]; - PADDLE_ENFORCE(y_dims_untrimmed[0] == n && y_dims_untrimmed[1] == c, - "Y should be in nc format"); + PADDLE_ENFORCE(y_dims_untrimmed[0] == n && y_dims_untrimmed[1] == c, + "Y should be in nc format"); - constexpr int simd_width = 16; - int C = c / simd_width; + constexpr int simd_width = 16; + int C = c / simd_width; - vector_mul mul; + vector_mul mul; - using mul_func_t = - void (*)(const float*, const float*, float*, int, int); + using mul_func_t = + void (*)(const float *, const float *, float *, int, int); - mul_func_t mul_func = (mul_func_t)mul.getCode(); + mul_func_t mul_func = (mul_func_t) mul.getCode(); - for (int ni = 0; ni < n; ni++) { - for (int ci = 0; ci < C; ci++) { - auto ptr_x = - x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < C; ci++) { + auto ptr_x = + x_data + ni * C * h * w * simd_width + + ci * h * w * simd_width; - auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; - auto ptr_z = - z_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; + auto ptr_z = + z_data + ni * C * h * w * simd_width + + ci * h * w * simd_width; - mul_func(ptr_x, ptr_y, ptr_z, h, w); + mul_func(ptr_x, ptr_y, ptr_z, h, w); + } } } + + z->set_layout(DataLayout::kMKLDNN); + z->set_format(x->format()); + } else { + PADDLE_THROW("Not implemented when dims are equal"); } + } else { + // Fallback to naive version: + auto mul_func = [](T a, T b) -> T { return a * b; }; + + TransformFunctor + functor( + x, y, z, + ctx.template device_context(), + mul_func); + axis = (axis == -1 ? x_dims.size() - y_dims_untrimmed.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + auto y_dims = trim_trailing_singular_dims(y_dims_untrimmed); + axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); + + if (post == 1) { + functor.RunRowWise(n, pre); + } else { + functor.RunMidWise(n, pre, post); + } z->set_layout(DataLayout::kMKLDNN); z->set_format(x->format()); - } else { - PADDLE_THROW("Not implemented when dims are equal"); } } }; diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py index a0581d16de..a89f439664 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py @@ -20,8 +20,7 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator from test_elementwise_mul_op import * - -class ElementwiseMulMKLDNNOp(ElementwiseMulOp): +class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp): def init_input_output(self): x = np.random.rand(1, 16, 2, 2).astype(self.dtype) self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2) @@ -30,6 +29,11 @@ class ElementwiseMulMKLDNNOp(ElementwiseMulOp): self.out = x * self.y.reshape(1, 16, 1, 1) self.out = self.out.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2) + def setUp(self): + super(TestElementwiseMulMKLDNNOp_BroadcastNCHW16c, self).setUp() + self.attrs["x_data_format"] = "nchw16c" + self.attrs["y_data_format"] = "nc" + def init_kernel_type(self): self.use_mkldnn = True @@ -45,6 +49,27 @@ class ElementwiseMulMKLDNNOp(ElementwiseMulOp): def test_check_grad_ingore_y(self): pass +class TestElementwiseMulMKLDNNOp_UnsupportedFormat(ElementwiseMulOp): + def init_input_output(self): + self.x = np.random.rand(1, 16, 2, 2).astype(self.dtype) + self.y = np.random.rand(1, 16).astype(self.dtype) + + self.out = self.x * self.y.reshape(1, 16, 1, 1) + + def init_kernel_type(self): + self.use_mkldnn = True + + def init_axis(self): + self.axis = 0 + + def test_check_grad_normal(self): + pass + + def test_check_grad_ingore_x(self): + pass + + def test_check_grad_ingore_y(self): + pass if __name__ == '__main__': unittest.main() From d14858e4baf0aaeeaa9ccd33623958de6f4a6bd4 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 6 Nov 2018 12:52:44 +0100 Subject: [PATCH 13/80] MKLDNN elementwise_mul: Parallelize mul --- paddle/fluid/operators/elementwise_mul_mkldnn_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc index d66c58bd45..36e88cd789 100644 --- a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc @@ -144,6 +144,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { mul_func_t mul_func = (mul_func_t) mul.getCode(); + #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { auto ptr_x = From f820573b9c6ffee12aaf64b656d902dc0c9532f5 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Wed, 7 Nov 2018 11:37:27 +0100 Subject: [PATCH 14/80] MKLDNN elementwise_mul: Add UTs --- .../test_elementwise_mul_mkldnn_op.py | 119 +++++++++++++++++- 1 file changed, 118 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py index a89f439664..a008979801 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py 
@@ -49,7 +49,37 @@ class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp): def test_check_grad_ingore_y(self): pass -class TestElementwiseMulMKLDNNOp_UnsupportedFormat(ElementwiseMulOp): +@unittest.skip("Not implemented yet.") +class TestElementwiseMulMKLDNNOp_BroadcastNCHW8c(ElementwiseMulOp): + def init_input_output(self): + x = np.random.rand(1, 8, 2, 2).astype(self.dtype) + self.x = x.transpose(0, 2, 3, 1).reshape(1, 8, 2, 2) + self.y = np.random.rand(1, 8).astype(self.dtype) + + self.out = x * self.y.reshape(1, 8, 1, 1) + self.out = self.out.transpose(0, 2, 3, 1).reshape(1, 8, 2, 2) + + def setUp(self): + super(TestElementwiseMulMKLDNNOp_BroadcastNCHW8c, self).setUp() + self.attrs["x_data_format"] = "nchw8c" + self.attrs["y_data_format"] = "nc" + + def init_kernel_type(self): + self.use_mkldnn = True + + def init_axis(self): + self.axis = 0 + + def test_check_grad_normal(self): + pass + + def test_check_grad_ingore_x(self): + pass + + def test_check_grad_ingore_y(self): + pass + +class TestElementwiseMulMKLDNNOp_FallbackNCHW(ElementwiseMulOp): def init_input_output(self): self.x = np.random.rand(1, 16, 2, 2).astype(self.dtype) self.y = np.random.rand(1, 16).astype(self.dtype) @@ -71,5 +101,92 @@ class TestElementwiseMulMKLDNNOp_UnsupportedFormat(ElementwiseMulOp): def test_check_grad_ingore_y(self): pass +class TestElementwiseMulMKLDNNOp_FallbackNCHW16C(ElementwiseMulOp): + def init_input_output(self): + x = np.random.rand(1, 16, 2, 2).astype(self.dtype) + self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2) + y = np.random.rand(1, 16, 2, 2).astype(self.dtype) + self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2) + + self.out = self.x * self.y + + def setUp(self): + super(TestElementwiseMulMKLDNNOp_FallbackNCHW16C, self).setUp() + self.attrs["x_data_format"] = "nchw16c" + self.attrs["y_data_format"] = "nchw16c" + + def init_kernel_type(self): + self.use_mkldnn = True + + def init_axis(self): + self.axis = 0 + + def test_check_grad_normal(self): + pass + + def test_check_grad_ingore_x(self): + pass + + def test_check_grad_ingore_y(self): + pass + +class TestElementwiseMulMKLDNNOp_FallbackNoReorders(ElementwiseMulOp): + def init_input_output(self): + x = np.random.rand(1, 16, 2, 2).astype(self.dtype) + self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2) + y = np.random.rand(1, 16, 2, 2).astype(self.dtype) + self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2) + + self.out = self.x * self.y + + def setUp(self): + super(TestElementwiseMulMKLDNNOp_FallbackNoReorders, self).setUp() + self.attrs["x_data_format"] = "nchw16c" + self.attrs["y_data_format"] = "nchw16c" + + def init_kernel_type(self): + self.use_mkldnn = True + + def init_axis(self): + self.axis = 0 + + def test_check_grad_normal(self): + pass + + def test_check_grad_ingore_x(self): + pass + + def test_check_grad_ingore_y(self): + pass + +@unittest.skip("Not implemented yet.") +class TestElementwiseMulMKLDNNOp_FallbackWithReorder(ElementwiseMulOp): + def init_input_output(self): + self.x = np.random.rand(1, 16, 2, 2).astype(self.dtype) + y = np.random.rand(1, 16, 2, 2).astype(self.dtype) + self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2) + + self.out = self.x * y + + def setUp(self): + super(TestElementwiseMulMKLDNNOp_FallbackNCHW16C, self).setUp() + self.attrs["x_data_format"] = "nchw" + self.attrs["y_data_format"] = "nchw16c" + + def init_kernel_type(self): + self.use_mkldnn = True + + def init_axis(self): + self.axis = 0 + + def test_check_grad_normal(self): + pass + + def 
test_check_grad_ingore_x(self): + pass + + def test_check_grad_ingore_y(self): + pass + if __name__ == '__main__': unittest.main() From 49b09327f673598dfaeac4bcc2613d50228b2a73 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Fri, 9 Nov 2018 15:21:07 +0100 Subject: [PATCH 15/80] MKLDNN elementwise_mul: Reorder on non-nchw input, fallback on non-16 divisable fm test=develop --- .../operators/elementwise_mul_mkldnn_op.cc | 111 ++++++++++++------ .../test_elementwise_mul_mkldnn_op.py | 62 +++++++++- 2 files changed, 131 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc index 36e88cd789..58aadd0033 100644 --- a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc @@ -95,6 +95,26 @@ static void UpdateDataFormat(const framework::ExecutionContext& ctx, } } +template +static void ReorderInput(framework::Tensor* tensor, + const platform::Place& place, + const mkldnn::engine& engine, + bool isFourDim) { + using platform::to_void_cast; + auto dims = paddle::framework::vectorize2int(tensor->dims()); + framework::Tensor out_tensor; + out_tensor.Resize(tensor->dims()); + out_tensor.set_format(isFourDim ? memory::format::nchw : memory::format::nc); + out_tensor.set_layout(tensor->layout()); + mkldnn::memory input_memory = {{{dims, platform::MKLDNNGetDataType(), + tensor->format()}, engine}, to_void_cast(tensor->data())}; + mkldnn::memory output_memory = {{{dims, platform::MKLDNNGetDataType(), + out_tensor.format()}, engine}, + to_void_cast(out_tensor.mutable_data(place))}; + platform::Reorder(input_memory, output_memory); + tensor->ShareDataWith(out_tensor); +} + template class ElementwiseMulMKLDNNKernel : public framework::OpKernel { public: @@ -111,63 +131,78 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { auto x_dims = x->dims(); auto y_dims_untrimmed = y->dims(); + auto x_int_dims = paddle::framework::vectorize2int(x_dims); UpdateDataFormat(ctx, (Tensor*)x, "x_data_format"); UpdateDataFormat(ctx, (Tensor*)y, "y_data_format"); - if (x->format() == memory::format::nChw16c && y->format() == memory::format::nc) { - if (x_dims != y_dims_untrimmed) { - int pre, n, post; - get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post); + const bool are_dims_divisable = !(x_int_dims[1] % 16); + const bool is_x_format_correct = x->format() == memory::format::nChw16c; + const bool is_y_format_correct = y->format() == memory::format::nc; + if (is_x_format_correct && is_y_format_correct && are_dims_divisable) { + int pre, n, post; + get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post); - if (post == 1) { - PADDLE_THROW("Not implemented when post is 1"); - } else { - // Just check whether it works for RE-Resnext. - PADDLE_ENFORCE_EQ(x_dims.size(), 4, "X should have 4 dimensions"); + if (post == 1) { + PADDLE_THROW("Not implemented when post is 1"); + } else { + // Just check whether it works for RE-Resnext. 
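Written out in plain C++, the blocked broadcast multiply that this branch JIT-compiles computes z[n][c][h][w] = x[n][c][h][w] * y[n][c], with x in nChw16c and one 16-float slice of y reused across all h*w positions of the matching channel block. A scalar reference sketch mirroring the kernel's pointer arithmetic (illustrative only, not the shipped code):

static void MulBroadcastnChw16cNC(const float* x, const float* y, float* z,
                                  int n, int C, int h, int w) {
  constexpr int simd_width = 16;
  for (int ni = 0; ni < n; ni++) {
    for (int ci = 0; ci < C; ci++) {  // C counts 16-channel blocks
      const float* ptr_x = x + (ni * C + ci) * h * w * simd_width;
      const float* ptr_y = y + (ni * C + ci) * simd_width;  // one 16-float slice
      float* ptr_z = z + (ni * C + ci) * h * w * simd_width;
      for (int s = 0; s < h * w; s++) {         // every spatial position
        for (int i = 0; i < simd_width; i++) {  // 16 channels of the block
          ptr_z[s * simd_width + i] = ptr_x[s * simd_width + i] * ptr_y[i];
        }
      }
    }
  }
}

The JIT version keeps the 16-float y slice resident in a single zmm register and issues one vmulps per spatial position; PATCH 13 additionally spreads the two outer loops across threads with #pragma omp parallel for collapse(2), fusing ni and ci into one n*C iteration space so small batches still parallelize.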
+ PADDLE_ENFORCE_EQ(x_dims.size(), 4, "X should have 4 dimensions"); - int n = x_dims[0]; - int c = x_dims[1]; - int h = x_dims[2]; - int w = x_dims[3]; + int n = x_dims[0]; + int c = x_dims[1]; + int h = x_dims[2]; + int w = x_dims[3]; - PADDLE_ENFORCE(y_dims_untrimmed[0] == n && y_dims_untrimmed[1] == c, - "Y should be in nc format"); + PADDLE_ENFORCE(y_dims_untrimmed[0] == n && y_dims_untrimmed[1] == c, + "Y should be in nc format"); - constexpr int simd_width = 16; - int C = c / simd_width; + constexpr int simd_width = 16; + int C = c / simd_width; - vector_mul mul; + vector_mul mul; - using mul_func_t = - void (*)(const float *, const float *, float *, int, int); + using mul_func_t = + void (*)(const float *, const float *, float *, int, int); - mul_func_t mul_func = (mul_func_t) mul.getCode(); + mul_func_t mul_func = (mul_func_t) mul.getCode(); - #pragma omp parallel for collapse(2) - for (int ni = 0; ni < n; ni++) { - for (int ci = 0; ci < C; ci++) { - auto ptr_x = - x_data + ni * C * h * w * simd_width + - ci * h * w * simd_width; + #pragma omp parallel for collapse(2) + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < C; ci++) { + auto ptr_x = + x_data + ni * C * h * w * simd_width + + ci * h * w * simd_width; - auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; - auto ptr_z = - z_data + ni * C * h * w * simd_width + - ci * h * w * simd_width; + auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; + auto ptr_z = + z_data + ni * C * h * w * simd_width + + ci * h * w * simd_width; - mul_func(ptr_x, ptr_y, ptr_z, h, w); - } + mul_func(ptr_x, ptr_y, ptr_z, h, w); } } - - z->set_layout(DataLayout::kMKLDNN); - z->set_format(x->format()); - } else { - PADDLE_THROW("Not implemented when dims are equal"); } + + z->set_layout(DataLayout::kMKLDNN); + z->set_format(x->format()); } else { // Fallback to naive version: + const bool are_inputs_in_same_format = x->format() == y->format(); + const bool is_x_nchw= x->format() == memory::format::nchw; + const bool is_x_nc = x->format() == memory::format::nc; + const bool is_y_nchw= y->format() == memory::format::nchw; + const bool is_y_nc = y->format() == memory::format::nc; + if(!are_inputs_in_same_format) { + using platform::MKLDNNDeviceContext; + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + if(!(is_x_nchw || is_x_nc)) + ReorderInput((Tensor*)x, ctx.GetPlace(), mkldnn_engine, x->dims().size() == 4); + if(!(is_y_nchw || is_y_nc)) + ReorderInput((Tensor*)y, ctx.GetPlace(), mkldnn_engine, y->dims().size() == 4); + } + auto mul_func = [](T a, T b) -> T { return a * b; }; TransformFunctor Date: Fri, 9 Nov 2018 15:43:55 +0100 Subject: [PATCH 16/80] Add Sand3r- to AUTHORS.md test=develop --- AUTHORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.md b/AUTHORS.md index 4060f75613..54a1097b50 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -42,6 +42,7 @@ | QiJune | Jun Qi | | qingqing01 | Qing-Qing Dang | | reyoung | Yang Yu | +| Sand3r- | Michal Gallus | | Superjom | Chun-Wei Yan | | tensor-tang | Jian Tang | | tianbingsz | Tian-Bing Xu | From 08f63c4d1253007ee6290f8dfab3c31195940168 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 13 Nov 2018 09:12:10 +0100 Subject: [PATCH 17/80] MKLDNN elementwise_mul: Lint changes to UT & integration test=develop --- .../operators/elementwise/elementwise_op.h | 24 ++++----- .../operators/elementwise_mul_mkldnn_op.cc | 54 +++++++++---------- .../test_elementwise_mul_mkldnn_op.py | 12 ++++- 3 files changed, 50 insertions(+), 
40 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 16d919689c..85a7817be9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -98,19 +98,19 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false). Used by MKLDNN.") .SetDefault(false); AddAttr( - "x_data_format", - "(string, default NCHW) Only used in mkldnn" - "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". " - "Defaults to \"\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault(""); + "x_data_format", + "(string, default NCHW) Only used in mkldnn" + "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". " + "Defaults to \"\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault(""); AddAttr( - "y_data_format", - "(string, default \"\") Only used in mkldnn" - "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". " - "Defaults to \"\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault(""); + "y_data_format", + "(string, default \"\") Only used in mkldnn" + "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". " + "Defaults to \"\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault(""); AddComment(string::Sprintf(R"DOC( Elementwise %s Operator diff --git a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc index 58aadd0033..6371c9f839 100644 --- a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc @@ -71,13 +71,13 @@ void check(const float* x, const float* y, float* z, int w) { static mkldnn::memory::format StringToMKLDNNFormat(std::string& format) { std::transform(format.begin(), format.end(), format.begin(), ::tolower); - if(!format.compare("nchw")) { + if (!format.compare("nchw")) { return memory::format::nchw; - } else if(!format.compare("nchw16c")) { + } else if (!format.compare("nchw16c")) { return memory::format::nChw16c; - } else if(!format.compare("nchw8c")) { + } else if (!format.compare("nchw8c")) { return memory::format::nChw8c; - } else if(!format.compare("nhwc")) { + } else if (!format.compare("nhwc")) { return memory::format::nhwc; } else { return memory::format::any; @@ -85,8 +85,8 @@ static mkldnn::memory::format StringToMKLDNNFormat(std::string& format) { } static void UpdateDataFormat(const framework::ExecutionContext& ctx, - framework::Tensor* tensor, const char* attribute) { - if(ctx.op().HasAttr(attribute)) { + framework::Tensor* tensor, const char* attribute) { + if (ctx.op().HasAttr(attribute)) { auto format_as_string = ctx.Attr(attribute); auto format = StringToMKLDNNFormat(format_as_string); if (format != memory::format::any) { @@ -98,19 +98,19 @@ static void UpdateDataFormat(const framework::ExecutionContext& ctx, template static void ReorderInput(framework::Tensor* tensor, const platform::Place& place, - const mkldnn::engine& engine, - bool isFourDim) { + const mkldnn::engine& engine, bool isFourDim) { using platform::to_void_cast; auto dims = paddle::framework::vectorize2int(tensor->dims()); framework::Tensor out_tensor; out_tensor.Resize(tensor->dims()); 
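ReorderInput above delegates the actual layout conversion to an MKL-DNN reorder primitive. For intuition, the nChw16c-to-nchw case it covers corresponds to the following plain loops (a scalar stand-in with illustrative names, assuming the channel count is a multiple of 16):

#include <cstddef>
#include <vector>

static std::vector<float> BlockedToPlain(const std::vector<float>& src, int N,
                                         int C, int H, int W) {
  const int kBlock = 16;
  std::vector<float> dst(src.size());
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w) {
          int cb = c / kBlock, ci = c % kBlock;
          std::size_t blocked =
              ((((std::size_t)n * (C / kBlock) + cb) * H + h) * W + w) *
                  kBlock + ci;
          std::size_t plain = (((std::size_t)n * C + c) * H + h) * W + w;
          dst[plain] = src[blocked];  // undo the 16-channel interleave
        }
  return dst;
}

The real primitive performs the same index permutation, vectorized and without going through std::vector; the kernel then calls ShareDataWith so the operator sees the reordered buffer in place of the original input.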
out_tensor.set_format(isFourDim ? memory::format::nchw : memory::format::nc); out_tensor.set_layout(tensor->layout()); - mkldnn::memory input_memory = {{{dims, platform::MKLDNNGetDataType(), - tensor->format()}, engine}, to_void_cast(tensor->data())}; - mkldnn::memory output_memory = {{{dims, platform::MKLDNNGetDataType(), - out_tensor.format()}, engine}, - to_void_cast(out_tensor.mutable_data(place))}; + mkldnn::memory input_memory = { + {{dims, platform::MKLDNNGetDataType(), tensor->format()}, engine}, + to_void_cast(tensor->data())}; + mkldnn::memory output_memory = { + {{dims, platform::MKLDNNGetDataType(), out_tensor.format()}, engine}, + to_void_cast(out_tensor.mutable_data(place))}; platform::Reorder(input_memory, output_memory); tensor->ShareDataWith(out_tensor); } @@ -163,21 +163,19 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { vector_mul mul; using mul_func_t = - void (*)(const float *, const float *, float *, int, int); + void (*)(const float*, const float*, float*, int, int); - mul_func_t mul_func = (mul_func_t) mul.getCode(); + mul_func_t mul_func = (mul_func_t)mul.getCode(); - #pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { auto ptr_x = - x_data + ni * C * h * w * simd_width + - ci * h * w * simd_width; + x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; auto ptr_z = - z_data + ni * C * h * w * simd_width + - ci * h * w * simd_width; + z_data + ni * C * h * w * simd_width + ci * h * w * simd_width; mul_func(ptr_x, ptr_y, ptr_z, h, w); } @@ -189,18 +187,20 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { } else { // Fallback to naive version: const bool are_inputs_in_same_format = x->format() == y->format(); - const bool is_x_nchw= x->format() == memory::format::nchw; + const bool is_x_nchw = x->format() == memory::format::nchw; const bool is_x_nc = x->format() == memory::format::nc; - const bool is_y_nchw= y->format() == memory::format::nchw; + const bool is_y_nchw = y->format() == memory::format::nchw; const bool is_y_nc = y->format() == memory::format::nc; - if(!are_inputs_in_same_format) { + if (!are_inputs_in_same_format) { using platform::MKLDNNDeviceContext; auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - if(!(is_x_nchw || is_x_nc)) - ReorderInput((Tensor*)x, ctx.GetPlace(), mkldnn_engine, x->dims().size() == 4); - if(!(is_y_nchw || is_y_nc)) - ReorderInput((Tensor*)y, ctx.GetPlace(), mkldnn_engine, y->dims().size() == 4); + if (!(is_x_nchw || is_x_nc)) + ReorderInput((Tensor*)x, ctx.GetPlace(), mkldnn_engine, + x->dims().size() == 4); + if (!(is_y_nchw || is_y_nc)) + ReorderInput((Tensor*)y, ctx.GetPlace(), mkldnn_engine, + y->dims().size() == 4); } auto mul_func = [](T a, T b) -> T { return a * b; }; diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py index 77d24a81f2..56e2ca849a 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py @@ -20,6 +20,7 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator from test_elementwise_mul_op import * + class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp): def init_input_output(self): x = np.random.rand(1, 16, 2, 2).astype(self.dtype) @@ 
-49,7 +50,9 @@ class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp): def test_check_grad_ingore_y(self): pass -@unittest.skip("Not implemented yet.") # TODO(mgallus): enable when implemented. + +@unittest.skip( + "Not implemented yet.") # TODO(mgallus): enable when implemented. class TestElementwiseMulMKLDNNOp_BroadcastNCHW8c(ElementwiseMulOp): def init_input_output(self): x = np.random.rand(1, 8, 2, 2).astype(self.dtype) @@ -79,6 +82,7 @@ class TestElementwiseMulMKLDNNOp_BroadcastNCHW8c(ElementwiseMulOp): def test_check_grad_ingore_y(self): pass + class TestElementwiseMulMKLDNNOp_FallbackNCHW(ElementwiseMulOp): def init_input_output(self): self.x = np.random.rand(1, 16, 2, 2).astype(self.dtype) @@ -101,6 +105,7 @@ class TestElementwiseMulMKLDNNOp_FallbackNCHW(ElementwiseMulOp): def test_check_grad_ingore_y(self): pass + class TestElementwiseMulMKLDNNOp_FallbackNCHW16C(ElementwiseMulOp): def init_input_output(self): x = np.random.rand(1, 16, 2, 2).astype(self.dtype) @@ -130,6 +135,7 @@ class TestElementwiseMulMKLDNNOp_FallbackNCHW16C(ElementwiseMulOp): def test_check_grad_ingore_y(self): pass + class TestElementwiseMulMKLDNNOp_FallbackNoReorders(ElementwiseMulOp): def init_input_output(self): x = np.random.rand(1, 16, 2, 2).astype(self.dtype) @@ -159,6 +165,7 @@ class TestElementwiseMulMKLDNNOp_FallbackNoReorders(ElementwiseMulOp): def test_check_grad_ingore_y(self): pass + class TestElementwiseMulMKLDNNOp_FallbackWithReorder1(ElementwiseMulOp): def init_input_output(self): self.x = np.random.rand(1, 16, 2, 2).astype(self.dtype) @@ -187,6 +194,7 @@ class TestElementwiseMulMKLDNNOp_FallbackWithReorder1(ElementwiseMulOp): def test_check_grad_ingore_y(self): pass + class TestElementwiseMulMKLDNNOp_FallbackWithReorder2(ElementwiseMulOp): def init_input_output(self): self.y = np.random.rand(1, 16, 2, 2).astype(self.dtype) @@ -215,6 +223,7 @@ class TestElementwiseMulMKLDNNOp_FallbackWithReorder2(ElementwiseMulOp): def test_check_grad_ingore_y(self): pass + class TestElementwiseMulMKLDNNOp_FallbackNoReorders2(ElementwiseMulOp): def init_input_output(self): self.x = np.random.rand(1, 16).astype(self.dtype) @@ -242,5 +251,6 @@ class TestElementwiseMulMKLDNNOp_FallbackNoReorders2(ElementwiseMulOp): def test_check_grad_ingore_y(self): pass + if __name__ == '__main__': unittest.main() From 785066eb8aa1ec552f3d093e8a7aa3d229700572 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 13 Nov 2018 12:12:08 +0100 Subject: [PATCH 18/80] MKLDNN elementwise_mul: Check if AVX512 is available test=develop --- paddle/fluid/operators/elementwise_mul_mkldnn_op.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc index 6371c9f839..216c7ed9c6 100644 --- a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc @@ -136,10 +136,13 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { UpdateDataFormat(ctx, (Tensor*)x, "x_data_format"); UpdateDataFormat(ctx, (Tensor*)y, "y_data_format"); + Xbyak::util::Cpu cpu; + const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F); const bool are_dims_divisable = !(x_int_dims[1] % 16); const bool is_x_format_correct = x->format() == memory::format::nChw16c; const bool is_y_format_correct = y->format() == memory::format::nc; - if (is_x_format_correct && is_y_format_correct && are_dims_divisable) { + if (is_x_format_correct && is_y_format_correct && 
are_dims_divisable && + is_avx512_enabled) { int pre, n, post; get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post); From 99e3e36a5701bf15e9a18f01b19a60ced78137aa Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 13 Nov 2018 15:03:14 +0100 Subject: [PATCH 19/80] MKLDNN elementwise_mul: Disable UT for CUDA test=develop --- python/paddle/fluid/tests/unittests/op_test.py | 4 +++- .../tests/unittests/test_elementwise_mul_mkldnn_op.py | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 690c4cf0ad..c195a28e45 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -362,7 +362,9 @@ class OpTest(unittest.TestCase): else: return [] places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): + cpu_only = self._cpu_only if hasattr(self, '_cpu_only') else False + if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\ + and not cpu_only: places.append(core.CUDAPlace(0)) return places diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py index 56e2ca849a..536e9a1c58 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py @@ -34,6 +34,7 @@ class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp): super(TestElementwiseMulMKLDNNOp_BroadcastNCHW16c, self).setUp() self.attrs["x_data_format"] = "nchw16c" self.attrs["y_data_format"] = "nc" + self._cpu_only = True def init_kernel_type(self): self.use_mkldnn = True @@ -66,6 +67,7 @@ class TestElementwiseMulMKLDNNOp_BroadcastNCHW8c(ElementwiseMulOp): super(TestElementwiseMulMKLDNNOp_BroadcastNCHW8c, self).setUp() self.attrs["x_data_format"] = "nchw8c" self.attrs["y_data_format"] = "nc" + self._cpu_only = True def init_kernel_type(self): self.use_mkldnn = True @@ -119,6 +121,7 @@ class TestElementwiseMulMKLDNNOp_FallbackNCHW16C(ElementwiseMulOp): super(TestElementwiseMulMKLDNNOp_FallbackNCHW16C, self).setUp() self.attrs["x_data_format"] = "nchw16c" self.attrs["y_data_format"] = "nchw16c" + self._cpu_only = True def init_kernel_type(self): self.use_mkldnn = True @@ -149,6 +152,7 @@ class TestElementwiseMulMKLDNNOp_FallbackNoReorders(ElementwiseMulOp): super(TestElementwiseMulMKLDNNOp_FallbackNoReorders, self).setUp() self.attrs["x_data_format"] = "nchw16c" self.attrs["y_data_format"] = "nchw16c" + self._cpu_only = True def init_kernel_type(self): self.use_mkldnn = True @@ -178,6 +182,7 @@ class TestElementwiseMulMKLDNNOp_FallbackWithReorder1(ElementwiseMulOp): super(TestElementwiseMulMKLDNNOp_FallbackWithReorder1, self).setUp() self.attrs["x_data_format"] = "nchw" self.attrs["y_data_format"] = "nchw16c" + self._cpu_only = True def init_kernel_type(self): self.use_mkldnn = True @@ -207,6 +212,7 @@ class TestElementwiseMulMKLDNNOp_FallbackWithReorder2(ElementwiseMulOp): super(TestElementwiseMulMKLDNNOp_FallbackWithReorder2, self).setUp() self.attrs["x_data_format"] = "nchw16c" self.attrs["y_data_format"] = "nchw" + self._cpu_only = True def init_kernel_type(self): self.use_mkldnn = True @@ -235,6 +241,7 @@ class TestElementwiseMulMKLDNNOp_FallbackNoReorders2(ElementwiseMulOp): super(TestElementwiseMulMKLDNNOp_FallbackNoReorders2, self).setUp() self.attrs["x_data_format"] = "nc" self.attrs["y_data_format"] 
= "nc" + self._cpu_only = True def init_kernel_type(self): self.use_mkldnn = True From c69c41604e29dfc8b463cb79fc4cc1864ba15372 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Thu, 15 Nov 2018 15:14:48 +0100 Subject: [PATCH 20/80] MKLDNN elementwise_mul: Move Kernel to KernelPool to avoid segfaults test=develop --- .../elementwise_mul_mkldnn_op.cc | 61 +++---------------- paddle/fluid/operators/math/jit_code.h | 36 +++++++++++ paddle/fluid/operators/math/jit_kernel.h | 9 +++ .../fluid/operators/math/jit_kernel_blas.cc | 41 +++++++++++++ 4 files changed, 95 insertions(+), 52 deletions(-) rename paddle/fluid/operators/{ => elementwise}/elementwise_mul_mkldnn_op.cc (85%) diff --git a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc similarity index 85% rename from paddle/fluid/operators/elementwise_mul_mkldnn_op.cc rename to paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index 216c7ed9c6..10290a4aef 100644 --- a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/fluid/operators/elementwise_op.h" -#include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/mkldnn_helper.h" -#include "xbyak/xbyak.h" -#include "xbyak/xbyak_util.h" +#include "paddle/fluid/operators/math/jit_kernel.h" +#include "xbyak.h" +#include "xbyak_util.h" namespace paddle { namespace operators { @@ -27,47 +28,6 @@ namespace operators { using framework::DataLayout; using mkldnn::memory; -struct vector_mul : public Xbyak::CodeGenerator { - vector_mul() { - // RDI is ptr X - // RSI is ptr Y - // RDX is ptr Z - // RCX is h - // r8 is w - - push(rbx); - - xor_(rax, rax); - xor_(r10, r10); - vmovups(zmm3, ptr[rsi]); - - L("h_loop"); - xor_(rbx, rbx); - L("w_loop"); - vmovups(zmm2, ptr[rdi + rax]); - vmulps(zmm1, zmm2, zmm3); - vmovups(ptr[rdx + rax], zmm1); - add(rax, 64); - inc(rbx); - cmp(r8, rbx); - jnz("w_loop"); - inc(r10); - cmp(r10, rcx); - jnz("h_loop"); - - pop(rbx); - ret(); - } -}; - -void check(const float* x, const float* y, float* z, int w) { - for (int wi = 0; wi < w; wi++) { - for (int i = 0; i < 16; i++) { - z[wi * 16 + i] = x[wi * 16 + i] * y[i]; - } - } -} - static mkldnn::memory::format StringToMKLDNNFormat(std::string& format) { std::transform(format.begin(), format.end(), format.begin(), ::tolower); @@ -163,12 +123,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - vector_mul mul; - - using mul_func_t = - void (*)(const float*, const float*, float*, int, int); - - mul_func_t mul_func = (mul_func_t)mul.getCode(); + const auto& multiply = + math::jitkernel::KernelPool::Instance() + .template Get>(n); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { @@ -180,7 +137,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { auto ptr_z = z_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - mul_func(ptr_x, ptr_y, ptr_z, h, w); + multiply->Compute(ptr_x, ptr_y, ptr_z, h, w); } } } diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 71205b211b..dbfe629013 100644 --- 
a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -156,6 +156,42 @@ class VActJitCode : public JitCode { ymm_t ymm_dst = ymm_t(1); }; +#ifdef PADDLE_WITH_MKLDNN +struct EltwiseMulnChw16cNC : public Xbyak::CodeGenerator { + explicit EltwiseMulnChw16cNC(size_t code_size = 256 * 1024) + : Xbyak::CodeGenerator(code_size) { + // RDI is ptr x_input + // RSI is ptr y_input + // RDX is ptr output + // RCX is height + // r8 is width + + push(rbx); + + xor_(rax, rax); + xor_(r10, r10); + vmovups(zmm3, ptr[rsi]); + + L("h_loop"); + xor_(rbx, rbx); + L("w_loop"); + vmovups(zmm2, ptr[rdi + rax]); + vmulps(zmm1, zmm2, zmm3); + vmovups(ptr[rdx + rax], zmm1); + add(rax, 64); + inc(rbx); + cmp(r8, rbx); + jnz("w_loop"); + inc(r10); + cmp(r10, rcx); + jnz("h_loop"); + + pop(rbx); + ret(); + } +}; +#endif + } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 4d8d3cd79a..110de3b140 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -94,6 +94,15 @@ class VAddBiasKernel : public Kernel { void (*Compute)(const T *, const T *, T *, int); }; +#ifdef PADDLE_WITH_MKLDNN +template +class EltwiseMulnChw16cNCKernel : public Kernel { + public: + // nChw16c = nChw16c .* NC + void (*Compute)(const float *, const float *, float *, int, int); +}; +#endif + template class VActKernel : public Kernel { public: diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index 36a50f2043..a143b51439 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -226,6 +226,44 @@ bool VAddKernelImpl::useMKL(int d) { } #endif +#ifdef PADDLE_WITH_MKLDNN +/* EltwiseMul for nChw16c & NC inputs JitKernel */ +template +class EltwiseMulnChw16cNCKernelImpl + : public math::jitkernel::EltwiseMulnChw16cNCKernel { + public: + JITKERNEL_DECLARE_STATIC_FUNC; + explicit EltwiseMulnChw16cNCKernelImpl(int d) + : EltwiseMulnChw16cNCKernel() { + using mul_func_t = void (*)(const float*, const float*, float*, int, int); +#ifdef PADDLE_WITH_XBYAK + if (useJIT(d)) { + // roughly estimate the size of code + size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8; + sz = sz > 4096 ? sz : 4096; + jitcode_.reset(new gen::EltwiseMulnChw16cNC(sz)); + this->Compute = (mul_func_t)jitcode_->getCode(); + return; + } +#endif + PADDLE_THROW( + "This kernel shouldn't be used in Non-Xbyak, Non-MKL-DNN " + "environemnt"); + } + +#ifdef PADDLE_WITH_XBYAK + + private: + std::unique_ptr jitcode_{nullptr}; +}; + +template <> +bool EltwiseMulnChw16cNCKernelImpl::useJIT(int d) { + return true; +} +#endif +#endif + /* VAddRelu JitKernel */ template class VAddReluKernelImpl : public VAddReluKernel { @@ -394,6 +432,9 @@ REGISTER_JITKERNEL(vscal, VScalKernel); REGISTER_JITKERNEL(vaddbias, VAddBiasKernel); REGISTER_JITKERNEL(vrelu, VReluKernel); REGISTER_JITKERNEL(videntity, VIdentityKernel); +#ifdef PADDLE_WITH_MKLDNN +REGISTER_JITKERNEL(eltwise_mul_nchw16c, EltwiseMulnChw16cNCKernel); +#endif } // namespace jitkernel } // namespace math From 4bf6817cbc7b98dd695bd60ac3e7ae6a460ed72f Mon Sep 17 00:00:00 2001 From: superjomn Date: Fri, 16 Nov 2018 20:49:38 +0800 Subject: [PATCH 21/80] fix gpu load model the parameters will load from CPUPlace, that will keep copying data between CPU and GPU places. 
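Reduced to a toy model, the fix is: resolve the place once, then construct the executor that loads the parameters on that same place, rather than always loading on the CPU and copying to the GPU afterwards. CPUPlace, CUDAPlace and Executor below are illustrative stand-ins, not Paddle's real classes (which pass the place around as a boost::variant):

#include <cstdio>
#include <string>
#include <variant>

struct CPUPlace {};
struct CUDAPlace {
  int device_id;
};
using Place = std::variant<CPUPlace, CUDAPlace>;

struct Executor {
  explicit Executor(const Place& p) : place(p) {}
  void LoadModel(const std::string& path) const {
    if (std::holds_alternative<CUDAPlace>(place)) {
      std::printf("loading %s directly onto GPU %d\n", path.c_str(),
                  std::get<CUDAPlace>(place).device_id);
    } else {
      std::printf("loading %s on the CPU\n", path.c_str());
    }
  }
  Place place;
};

int main() {
  bool use_gpu = true;
  int gpu_device_id = 0;  // plays the role of the patch's new gpu_device_id field
  Place place = use_gpu ? Place(CUDAPlace{gpu_device_id}) : Place(CPUPlace{});
  Executor exe(place);  // the loader and the predictor share one place
  exe.LoadModel("model_dir");
  return 0;
}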
test=develop --- paddle/fluid/inference/analysis/argument.h | 1 + .../analysis/passes/ir_graph_build_pass.cc | 24 ++++++++++++++----- .../analysis/passes/ir_graph_build_pass.h | 8 ++++--- .../fluid/inference/api/analysis_predictor.cc | 4 ++-- 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index d7a2f3d1e3..21203e2d9f 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -116,6 +116,7 @@ struct Argument { std::vector); DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); + DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller, std::function); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index a30fef08b5..d5e0d90de1 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -30,15 +30,28 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { if (!argument->scope_valid()) { argument->SetScope(new framework::Scope); } + PADDLE_ENFORCE(argument->use_gpu_valid()); + + // The load program should run on the same device as the inference program, + // so that the parameters will be on the same device, or they will keep copying + // between different devices. platform::Place place; if (argument->use_gpu()) { + PADDLE_ENFORCE(argument->gpu_device_id_valid()); place = platform::CUDAPlace(argument->gpu_device_id()); } else { place = platform::CPUPlace(); } if (argument->model_dir_valid()) { - auto program = LoadModel(argument->model_dir(), argument->scope_ptr()); + auto program = + LoadModel(argument->model_dir(), argument->scope_ptr(), place); argument->SetMainProgram(program.release()); } else if (argument->model_program_path_valid() && argument->model_params_path_valid()) { auto program = LoadModel(argument->model_program_path(), argument->model_params_path(), - argument->scope_ptr()); + argument->scope_ptr(), place); argument->SetMainProgram(program.release()); } else { PADDLE_THROW( @@ -52,16 +65,15 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { } std::unique_ptr IrGraphBuildPass::LoadModel( - const std::string &path, framework::Scope *scope) { - platform::CPUPlace place; + const std::string &path, framework::Scope *scope, + const platform::Place &place) { framework::Executor exe(place); return Load(&exe, scope, path); } std::unique_ptr IrGraphBuildPass::LoadModel( const std::string &program_path, const std::string &params_path, - framework::Scope *scope) { - platform::CPUPlace place; + framework::Scope *scope, const platform::Place &place) { framework::Executor exe(place); return Load(&exe, scope, program_path, params_path); } diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h index 3291e4f6ad..b0a0b8b75e 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -32,11 +32,13 @@ class IrGraphBuildPass : public AnalysisPass { std::string repr() const override; private: - std::unique_ptr LoadModel(const std::string &path, - framework::Scope *scope); + std::unique_ptr LoadModel( + const std::string &path, framework::Scope *scope, + const boost::variant &place); std::unique_ptr 
LoadModel( const std::string &program_path, const std::string &params_path, - framework::Scope *scope); + framework::Scope *scope, + const boost::variant &place); std::string model_binary_str_; }; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d19505877b..3a707907d9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -285,6 +285,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { status_program_optimized_ = true; argument_.SetUseGPU(config_.use_gpu); + argument_.SetGPUDeviceId(config_.device); // Analyze inference_program if (!config_.model_dir.empty()) { argument_.SetModelDir(config_.model_dir); @@ -491,8 +492,7 @@ bool AnalysisPredictor::LoadParameters() { } // Use NaiveExecutor to Load parameters. - platform::CPUPlace place; - framework::NaiveExecutor e(place); + framework::NaiveExecutor e(place_); e.Prepare(scope_.get(), *load_program, 0, false); e.Run(); VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load"; From def272cf42e9b2ebf529b39f183874a1dede9c2a Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Fri, 16 Nov 2018 15:29:15 +0100 Subject: [PATCH 22/80] MKLDNN elementwise_mul: Revert changes to eltwise_add tests --- .../paddle/fluid/tests/unittests/test_elementwise_add_op.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index d71a9c0151..5aec5d8e38 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -43,13 +43,19 @@ class TestElementwiseAddOp(OpTest): self.check_output() def test_check_grad_normal(self): + if self.dtype == np.float16: + return self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005) def test_check_grad_ingore_x(self): + if self.dtype == np.float16: + return self.check_grad( ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X")) def test_check_grad_ingore_y(self): + if self.dtype == np.float16: + return self.check_grad( ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) From d2c9ddbc025f7f46fea01d482d56bfb6574eaa53 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 16 Nov 2018 22:45:20 +0800 Subject: [PATCH 23/80] Polish code test=develop --- tools/manylinux1/build_scripts/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh index c0f01601c8..ace0bebd9d 100644 --- a/tools/manylinux1/build_scripts/build.sh +++ b/tools/manylinux1/build_scripts/build.sh @@ -25,7 +25,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 # Dependencies for compiling Python that we want to remove from # the final image after compiling Python -PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-dev" +PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel" # Libraries that are allowed as part of the manylinux1 profile MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel" From ba3eaed7a7426a10f4a394071852c6f5d6ab8e1e Mon Sep 17 
00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 09:13:34 +0000 Subject: [PATCH 24/80] exp support all size --- paddle/fluid/operators/math/jit_code.cc | 114 ++++++++++++++++-- paddle/fluid/operators/math/jit_code.h | 8 +- .../fluid/operators/math/jit_kernel_test.cc | 5 +- 3 files changed, 113 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index e3b600d442..9efd4e8174 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -81,10 +81,10 @@ void VXXJitCode::generate() { } if (rest >= 2) { if (scalar_index_ != 1) { - vmovups(xmm_src1, ptr[param1 + offset]); + vmovq(xmm_src1, ptr[param1 + offset]); } if (scalar_index_ != 2) { - vmovups(xmm_src2, ptr[param2 + offset]); + vmovq(xmm_src2, ptr[param2 + offset]); } if (type_ == operand_type::mul) { vmulps(xmm_dst, xmm_src1, xmm_src2); @@ -100,10 +100,10 @@ void VXXJitCode::generate() { } if (rest > 0) { if (scalar_index_ != 1) { - vmovups(xmm_src1, ptr[param1 + offset]); + vmovss(xmm_src1, ptr[param1 + offset]); } if (scalar_index_ != 2) { - vmovups(xmm_src2, ptr[param2 + offset]); + vmovss(xmm_src2, ptr[param2 + offset]); } if (type_ == operand_type::mul) { vmulss(xmm_dst, xmm_src1, xmm_src2); @@ -179,7 +179,7 @@ bool VActJitCode::init(int d, operand_type type) { return ok; } else if (type == operand_type::exp) { // exp is slower than mkl when d >= 256 - return ok && d % 8 == 0 && d < 256; + return ok; //&& d % 4 == 0 && d < 256; } else { // TODO(TJ): support more return ok && d % 8 == 0; @@ -190,6 +190,10 @@ void VActJitCode::relu_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, ymm_t& ymm_zero) { vmaxps(ymm_dst, ymm_zero, ymm_src); } +void VActJitCode::relu_xmm(xmm_t& xmm_dst, xmm_t& xmm_src, xmm_t& xmm_zero) { + vmaxps(xmm_dst, xmm_zero, xmm_src); +} + void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore @@ -271,6 +275,65 @@ void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, pop(reg_ptr_global); } +void VActJitCode::exp_xmm(xmm_t& ymm_dst, xmm_t& ymm_src, int fx_idx, + int fy_idx, int mask_idx, int tmp_idx) { + assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore + // check all idx can not equal + xmm_t ymm_fx = xmm_t(fx_idx); + xmm_t ymm_fy = xmm_t(fy_idx); + xmm_t ymm_mask = xmm_t(mask_idx); + xmm_t ymm_tmp = xmm_t(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); + vminps(ymm_src, ymm_src, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); + vmaxps(ymm_src, ymm_src, ymm_tmp); + // express exp(x) as exp(g + n*log(2)) + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); + vmulps(ymm_fx, ymm_src, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); + vaddps(ymm_fx, ymm_fx, ymm_tmp); + vroundps(ymm_fy, ymm_fx, 0x01); + // if greater, substract 1 + vcmpgtps(ymm_mask, ymm_fy, ymm_fx); + vmovaps(ymm_tmp, ptr[reg_ptr_global]); + vandps(ymm_mask, ymm_mask, ymm_tmp); + vsubps(ymm_fx, ymm_fy, ymm_mask); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); + vmulps(ymm_fy, ymm_fx, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); + xmm_t ymm_z = xmm_t(ymm_mask.getIdx()); + vmulps(ymm_z, ymm_fx, ymm_tmp); + vsubps(ymm_src, ymm_src, ymm_fy); + vsubps(ymm_src, ymm_src, ymm_z); + 
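The vmovq and vmovss fixes at the top of this patch matter because the tail of a vector is no longer guaranteed to span a full XMM register: the generated code now walks any remainder down through 4-, 2- and 1-float moves after the 8-wide YMM bulk loop. The same scheme in scalar form, using relu as the op (an illustrative stand-in, not the generated code):

static void ReluAnySize(const float* x, float* y, int n) {
  int offset = 0;
  // Bulk: 8 floats at a time, the YMM_FLOAT_BLOCK loop.
  for (; offset + 8 <= n; offset += 8) {
    for (int i = 0; i < 8; ++i) {
      y[offset + i] = x[offset + i] > 0.f ? x[offset + i] : 0.f;
    }
  }
  // Tail: descend 4 -> 2 -> 1, mirroring vmovups / vmovq / vmovss.
  int rest = n - offset;
  for (int block = 4; rest > 0; block /= 2) {
    if (rest >= block) {
      for (int i = 0; i < block; ++i) {
        y[offset + i] = x[offset + i] > 0.f ? x[offset + i] : 0.f;
      }
      offset += block;
      rest -= block;
    }
  }
}

Loading a full 16 bytes with vmovups when only one or two floats remain would read past the end of the input buffer, which is why the narrower vmovq (8 bytes) and vmovss (4 bytes) forms are substituted for the tail.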
vmulps(ymm_z, ymm_src, ymm_src); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); + vmulps(ymm_dst, ymm_src, ymm_tmp); + for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; + i += (YMM_FLOAT_BLOCK * sizeof(float))) { + vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4 + vaddps(ymm_dst, ymm_dst, ymm_tmp); + vmulps(ymm_dst, ymm_dst, ymm_src); + } + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); + vaddps(ymm_dst, ymm_dst, ymm_tmp); + vmulps(ymm_dst, ymm_dst, ymm_z); + vaddps(ymm_dst, ymm_dst, ymm_src); + vmovaps(ymm_tmp, ptr[reg_ptr_global]); + vaddps(ymm_dst, ymm_dst, ymm_tmp); + // build 2^n + xmm_t ymm_int = ymm_fx; + vcvttps2dq(ymm_int, ymm_fx); + mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); + vmovdqa(ymm_tmp, ptr[reg_ptr_global]); + vpaddd(ymm_int, ymm_int, ymm_tmp); + vpslld(ymm_int, ymm_int, 23); + vmulps(ymm_dst, ymm_dst, ymm_int); + pop(reg_ptr_global); +} + void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { // y = 1 / (1 + e^-x) @@ -343,7 +406,7 @@ void VActJitCode::generate() { vmovups(ptr[param2 + offset], ymm_dst); offset += sizeof(float) * YMM_FLOAT_BLOCK; } - if (type_ != operand_type::relu) { + if (type_ != operand_type::relu && type_ != operand_type::exp) { // TODO(TJ): remove me ret(); return; @@ -351,21 +414,50 @@ void VActJitCode::generate() { int rest = num_ % YMM_FLOAT_BLOCK; if (rest >= 4) { vmovups(xmm_src, ptr[param1 + offset]); - vmaxps(xmm_dst, xmm_zero, xmm_src); + switch (type_) { + case operand_type::relu: + relu_xmm(xmm_dst, xmm_src, xmm_zero); + break; + case operand_type::exp: + exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + default: + break; + } vmovups(ptr[param2 + offset], xmm_dst); offset += sizeof(float) * 4; rest -= 4; } if (rest >= 2) { - vmovups(xmm_src, ptr[param1 + offset]); - vmaxps(xmm_dst, xmm_zero, xmm_src); + vmovq(xmm_src, ptr[param1 + offset]); + switch (type_) { + case operand_type::relu: + relu_xmm(xmm_dst, xmm_src, xmm_zero); + break; + case operand_type::exp: + exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + default: + break; + } vmovq(ptr[param2 + offset], xmm_dst); offset += sizeof(float) * 2; rest -= 2; } if (rest > 0) { - vmovups(xmm_src, ptr[param1 + offset]); - vmaxps(xmm_dst, xmm_zero, xmm_src); + // vmovups(); + vmovss(xmm_src, ptr[param1 + offset]); + + switch (type_) { + case operand_type::relu: + relu_xmm(xmm_dst, xmm_src, xmm_zero); + break; + case operand_type::exp: + exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + default: + break; + } vmovss(ptr[param2 + offset], xmm_dst); } ret(); diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 71205b211b..1467978f26 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -127,13 +127,17 @@ class VActJitCode : public JitCode { void generate() override; protected: - // compute relu with ymm + // compute relu with ymm, xmm void relu_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, const Xbyak::Ymm& zero); + void relu_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, + const Xbyak::Xmm& zero); - // compute exp with ymm + // compute exp with ymm, xmm void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + void exp_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, int fx_idx = 2, + int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); // compute sigmoid with ymm void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 
2, diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 5a6f87fe1f..178298bf56 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -33,6 +33,9 @@ limitations under the License. */ constexpr int repeat = 20000; +// TODO(TJ): benchmark and test should be seperated, +// benchmark should verify more sizes + inline double GetCurrentUS() { struct timeval time; gettimeofday(&time, NULL); @@ -156,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) { TEST(JitKernel, vexp) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 128, 256}) { + for (int d : {7, 8, 12, 15, 16, 20, 30, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -2.f, 2.f); From 4e67fe6a122636bc84b2f8df6d5f94feb5ed1a78 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 10:09:40 +0000 Subject: [PATCH 25/80] refine act and vxx with all size --- paddle/fluid/operators/math/jit_code.cc | 147 ++++++++++-------------- 1 file changed, 60 insertions(+), 87 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 9efd4e8174..a5eef019c8 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -60,60 +60,53 @@ void VXXJitCode::generate() { offset += sizeof(float) * YMM_FLOAT_BLOCK; } int rest = num_ % YMM_FLOAT_BLOCK; - if (rest >= 4) { - if (scalar_index_ != 1) { - vmovups(xmm_src1, ptr[param1 + offset]); - } - if (scalar_index_ != 2) { - vmovups(xmm_src2, ptr[param2 + offset]); - } - if (type_ == operand_type::mul) { - vmulps(xmm_dst, xmm_src1, xmm_src2); - } else if (type_ == operand_type::add) { - vaddps(xmm_dst, xmm_src1, xmm_src2); - } - if (with_relu_) { - vmaxps(xmm_dst, xmm_zero, xmm_dst); - } - vmovups(ptr[param3 + offset], xmm_dst); - offset += sizeof(float) * 4; - rest -= 4; - } - if (rest >= 2) { - if (scalar_index_ != 1) { - vmovq(xmm_src1, ptr[param1 + offset]); - } - if (scalar_index_ != 2) { - vmovq(xmm_src2, ptr[param2 + offset]); + int block = XMM_FLOAT_BLOCK; + while (rest > 0) { + if (rest >= 4) { + if (scalar_index_ != 1) { + vmovups(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(xmm_src2, ptr[param2 + offset]); + } + } else if (rest >= 2) { + if (scalar_index_ != 1) { + vmovq(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovq(xmm_src2, ptr[param2 + offset]); + } + } else { + if (scalar_index_ != 1) { + vmovss(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovss(xmm_src2, ptr[param2 + offset]); + } } - if (type_ == operand_type::mul) { - vmulps(xmm_dst, xmm_src1, xmm_src2); - } else if (type_ == operand_type::add) { - vaddps(xmm_dst, xmm_src1, xmm_src2); + switch (type_) { + case operand_type::mul: + vmulps(xmm_dst, xmm_src1, xmm_src2); + break; + case operand_type::add: + vaddps(xmm_dst, xmm_src1, xmm_src2); + break; + default: + break; } if (with_relu_) { vmaxps(xmm_dst, xmm_zero, xmm_dst); } - vmovq(ptr[param3 + offset], xmm_dst); - offset += sizeof(float) * 2; - rest -= 2; - } - if (rest > 0) { - if (scalar_index_ != 1) { - vmovss(xmm_src1, ptr[param1 + offset]); - } - if (scalar_index_ != 2) { - vmovss(xmm_src2, ptr[param2 + offset]); - } - if (type_ == operand_type::mul) { - vmulss(xmm_dst, xmm_src1, xmm_src2); - } else if (type_ == operand_type::add) { - vaddss(xmm_dst, xmm_src1, xmm_src2); + if (rest >= 4) { + 
vmovups(ptr[param3 + offset], xmm_dst); + } else if (rest >= 2) { + vmovq(ptr[param3 + offset], xmm_dst); + } else { + vmovss(ptr[param3 + offset], xmm_dst); } - if (with_relu_) { - vmaxps(xmm_dst, xmm_zero, xmm_dst); - } - vmovss(ptr[param3 + offset], xmm_dst); + offset += sizeof(float) * block; + rest -= block; + block /= 2; } ret(); } @@ -175,11 +168,9 @@ static int g_tmp_mem[16] ALIGN32 = {0}; bool VActJitCode::init(int d, operand_type type) { bool ok = MayIUse(avx); - if (type == operand_type::relu) { + if (type == operand_type::relu || type == operand_type::exp) { + // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256 return ok; - } else if (type == operand_type::exp) { - // exp is slower than mkl when d >= 256 - return ok; //&& d % 4 == 0 && d < 256; } else { // TODO(TJ): support more return ok && d % 8 == 0; @@ -412,24 +403,15 @@ void VActJitCode::generate() { return; } int rest = num_ % YMM_FLOAT_BLOCK; - if (rest >= 4) { - vmovups(xmm_src, ptr[param1 + offset]); - switch (type_) { - case operand_type::relu: - relu_xmm(xmm_dst, xmm_src, xmm_zero); - break; - case operand_type::exp: - exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - default: - break; + int block = XMM_FLOAT_BLOCK; + while (rest > 0) { + if (rest >= 4) { + vmovups(xmm_src, ptr[param1 + offset]); + } else if (rest >= 2) { + vmovq(xmm_src, ptr[param1 + offset]); + } else { + vmovss(xmm_src, ptr[param1 + offset]); } - vmovups(ptr[param2 + offset], xmm_dst); - offset += sizeof(float) * 4; - rest -= 4; - } - if (rest >= 2) { - vmovq(xmm_src, ptr[param1 + offset]); switch (type_) { case operand_type::relu: relu_xmm(xmm_dst, xmm_src, xmm_zero); @@ -440,25 +422,16 @@ void VActJitCode::generate() { default: break; } - vmovq(ptr[param2 + offset], xmm_dst); - offset += sizeof(float) * 2; - rest -= 2; - } - if (rest > 0) { - // vmovups(); - vmovss(xmm_src, ptr[param1 + offset]); - - switch (type_) { - case operand_type::relu: - relu_xmm(xmm_dst, xmm_src, xmm_zero); - break; - case operand_type::exp: - exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - default: - break; + if (rest >= 4) { + vmovups(ptr[param2 + offset], xmm_dst); + } else if (rest >= 2) { + vmovq(ptr[param2 + offset], xmm_dst); + } else { + vmovss(ptr[param2 + offset], xmm_dst); } - vmovss(ptr[param2 + offset], xmm_dst); + offset += sizeof(float) * block; + rest -= block; + block /= 2; } ret(); } From d3eae8f61b26c4fa053a74ce35aeb241db2c3b3b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 14:58:43 +0000 Subject: [PATCH 26/80] refine relu and fix addrelu test --- paddle/fluid/operators/math/jit_code.cc | 12 ++---------- paddle/fluid/operators/math/jit_code.h | 8 ++++---- paddle/fluid/operators/math/jit_kernel_test.cc | 2 +- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index a5eef019c8..2a10cd7821 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -177,14 +177,6 @@ bool VActJitCode::init(int d, operand_type type) { } } -void VActJitCode::relu_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, ymm_t& ymm_zero) { - vmaxps(ymm_dst, ymm_zero, ymm_src); -} - -void VActJitCode::relu_xmm(xmm_t& xmm_dst, xmm_t& xmm_src, xmm_t& xmm_zero) { - vmaxps(xmm_dst, xmm_zero, xmm_src); -} - void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore @@ -378,7 +370,7 @@ void 
VActJitCode::generate() { vmovups(ymm_src, ptr[param1 + offset]); switch (type_) { case operand_type::relu: - relu_ymm(ymm_dst, ymm_src, ymm_zero); + relu_jmm(ymm_dst, ymm_src, ymm_zero); break; case operand_type::exp: exp_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); @@ -414,7 +406,7 @@ void VActJitCode::generate() { } switch (type_) { case operand_type::relu: - relu_xmm(xmm_dst, xmm_src, xmm_zero); + relu_jmm(xmm_dst, xmm_src, xmm_zero); break; case operand_type::exp: exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 1467978f26..6adeebca7c 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -128,10 +128,10 @@ class VActJitCode : public JitCode { protected: // compute relu with ymm, xmm - void relu_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, - const Xbyak::Ymm& zero); - void relu_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, - const Xbyak::Xmm& zero); + template + void relu_jmm(JMM& dst, JMM& src, JMM& zero) { // NOLINT + vmaxps(dst, src, zero); + } // compute exp with ymm, xmm void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 178298bf56..932fa4c000 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -762,7 +762,7 @@ TEST(JitKernel, vaddrelu) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vadd_ref(d, x_data, y_data, zref_data); + vaddrelu_ref(d, x_data, y_data, zref_data); } auto trefe = GetCurrentUS(); auto tmkls = GetCurrentUS(); From ccb8963705205eef1f7447be7964dce008c7b997 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 16:54:48 +0000 Subject: [PATCH 27/80] refine exp jitcode with all size test=develop --- paddle/fluid/operators/math/jit_code.cc | 223 +++-------------------- paddle/fluid/operators/math/jit_code.h | 132 +++++++++++++- paddle/fluid/operators/math/jit_kernel.h | 1 + 3 files changed, 153 insertions(+), 203 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 2a10cd7821..fd18256b0c 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/jit_code.h" -#include "paddle/fluid/operators/math/jit_kernel.h" -#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me namespace paddle { namespace operators { @@ -111,60 +110,26 @@ void VXXJitCode::generate() { ret(); } -#define ALIGN32 __attribute__((aligned(32))) -#define EXP_HIG 88.3762626647949f -#define EXP_LOW -88.3762626647949f -#define CEPHES_LOG2EF 1.44269504088896341 -#define CEPHES_EXP_C1 0.693359375 -#define CEPHES_EXP_C2 -2.12194440e-4 -#define CEPHES_EXP_P0 1.9875691500E-4 -#define CEPHES_EXP_P1 1.3981999507E-3 -#define CEPHES_EXP_P2 8.3334519073E-3 -#define CEPHES_EXP_P3 4.1665795894E-2 -#define CEPHES_EXP_P4 1.6666665459E-1 -#define CEPHES_EXP_P5 5.0000001201E-1 +const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f), + REPEAT_8TIMES(2.f), + REPEAT_8TIMES(0.5f), + REPEAT_8TIMES(EXP_HIG), + REPEAT_8TIMES(EXP_LOW), + REPEAT_8TIMES(CEPHES_LOG2EF), + REPEAT_8TIMES(CEPHES_EXP_C1), + REPEAT_8TIMES(CEPHES_EXP_C2), + REPEAT_8TIMES(CEPHES_EXP_P0), + REPEAT_8TIMES(CEPHES_EXP_P1), + REPEAT_8TIMES(CEPHES_EXP_P2), + REPEAT_8TIMES(CEPHES_EXP_P3), + REPEAT_8TIMES(CEPHES_EXP_P4), + REPEAT_8TIMES(CEPHES_EXP_P5), + REPEAT_8TIMES(EXP_MAX_INPUT), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; -#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val - -#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float) - -static const float exp_float_consts[] ALIGN32 = { - REPEAT_8TIMES(1.f), - REPEAT_8TIMES(2.f), - REPEAT_8TIMES(0.5f), - REPEAT_8TIMES(EXP_HIG), - REPEAT_8TIMES(EXP_LOW), - REPEAT_8TIMES(CEPHES_LOG2EF), - REPEAT_8TIMES(CEPHES_EXP_C1), - REPEAT_8TIMES(CEPHES_EXP_C2), - REPEAT_8TIMES(CEPHES_EXP_P0), - REPEAT_8TIMES(CEPHES_EXP_P1), - REPEAT_8TIMES(CEPHES_EXP_P2), - REPEAT_8TIMES(CEPHES_EXP_P3), - REPEAT_8TIMES(CEPHES_EXP_P4), - REPEAT_8TIMES(CEPHES_EXP_P5), - REPEAT_8TIMES(EXP_MAX_INPUT), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; - -static const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; -static int g_tmp_mem[16] ALIGN32 = {0}; +const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; +int g_tmp_mem[16] ALIGN32 = {0}; bool VActJitCode::init(int d, operand_type type) { bool ok = MayIUse(avx); @@ -177,146 +142,6 @@ bool VActJitCode::init(int d, operand_type type) { } } -void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - assert(ymm_src.getIdx() != ymm_dst.getIdx()); // 
TODO(TJ): use enfore - // check all idx can not equal - ymm_t ymm_fx = ymm_t(fx_idx); - ymm_t ymm_fy = ymm_t(fy_idx); - ymm_t ymm_mask = ymm_t(mask_idx); - ymm_t ymm_tmp = ymm_t(tmp_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(ymm_src, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); - vmaxps(ymm_src, ymm_src, ymm_tmp); - // express exp(x) as exp(g + n*log(2)) - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(ymm_fx, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); - vaddps(ymm_fx, ymm_fx, ymm_tmp); - vroundps(ymm_fy, ymm_fx, 0x01); - // if greater, substract 1 - vcmpgtps(ymm_mask, ymm_fy, ymm_fx); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vandps(ymm_mask, ymm_mask, ymm_tmp); - vsubps(ymm_fx, ymm_fy, ymm_mask); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); - vmulps(ymm_fy, ymm_fx, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); - ymm_t ymm_z = ymm_t(ymm_mask.getIdx()); - vmulps(ymm_z, ymm_fx, ymm_tmp); - vsubps(ymm_src, ymm_src, ymm_fy); - vsubps(ymm_src, ymm_src, ymm_z); - vmulps(ymm_z, ymm_src, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(ymm_dst, ymm_src, ymm_tmp); - for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; - i += (YMM_FLOAT_BLOCK * sizeof(float))) { - vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4 - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_src); - } - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_z); - vaddps(ymm_dst, ymm_dst, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - // build 2^n - ymm_t ymm_int = ymm_fx; - vcvttps2dq(ymm_int, ymm_fx); - mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); - vmovdqa(ymm_tmp, ptr[reg_ptr_global]); - if (MayIUse(avx2)) { - vpaddd(ymm_int, ymm_int, ymm_tmp); - vpslld(ymm_int, ymm_int, 23); - } else if (MayIUse(avx)) { - xmm_t xtmp1 = xmm_t(ymm_int.getIdx()); - xmm_t xtmp2 = xmm_t(ymm_tmp.getIdx()); - reg64_t reg_ptr_tmp = reg_ptr_global; - mov(reg_ptr_tmp, reinterpret_cast(g_tmp_mem)); - vmovdqa(ptr[reg_ptr_tmp], ymm_int); - vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], ymm_tmp); - vpaddd(xtmp1, xtmp1, xtmp2); - vpslld(xtmp1, xtmp1, 23); - vmovdqa(ptr[reg_ptr_tmp], xtmp1); - // next 128bits - vmovdqa(xtmp1, ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)]); - vmovdqa(xtmp2, - ptr[reg_ptr_tmp + - (YMM_FLOAT_BLOCK + 4 /*xmm float block*/) * sizeof(float)]); - vpaddd(xtmp1, xtmp1, xtmp2); - vpslld(xtmp1, xtmp1, 23); - vmovdqa(ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)], xtmp1); - // load out - vmovdqa(ymm_int, ptr[reg_ptr_tmp]); - } - vmulps(ymm_dst, ymm_dst, ymm_int); - pop(reg_ptr_global); -} - -void VActJitCode::exp_xmm(xmm_t& ymm_dst, xmm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore - // check all idx can not equal - xmm_t ymm_fx = xmm_t(fx_idx); - xmm_t ymm_fy = xmm_t(fy_idx); - xmm_t ymm_mask = xmm_t(mask_idx); - xmm_t ymm_tmp = xmm_t(tmp_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(ymm_src, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global 
+ OFFSET_EXP_LOW]); - vmaxps(ymm_src, ymm_src, ymm_tmp); - // express exp(x) as exp(g + n*log(2)) - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(ymm_fx, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); - vaddps(ymm_fx, ymm_fx, ymm_tmp); - vroundps(ymm_fy, ymm_fx, 0x01); - // if greater, substract 1 - vcmpgtps(ymm_mask, ymm_fy, ymm_fx); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vandps(ymm_mask, ymm_mask, ymm_tmp); - vsubps(ymm_fx, ymm_fy, ymm_mask); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); - vmulps(ymm_fy, ymm_fx, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); - xmm_t ymm_z = xmm_t(ymm_mask.getIdx()); - vmulps(ymm_z, ymm_fx, ymm_tmp); - vsubps(ymm_src, ymm_src, ymm_fy); - vsubps(ymm_src, ymm_src, ymm_z); - vmulps(ymm_z, ymm_src, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(ymm_dst, ymm_src, ymm_tmp); - for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; - i += (YMM_FLOAT_BLOCK * sizeof(float))) { - vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4 - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_src); - } - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_z); - vaddps(ymm_dst, ymm_dst, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - // build 2^n - xmm_t ymm_int = ymm_fx; - vcvttps2dq(ymm_int, ymm_fx); - mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); - vmovdqa(ymm_tmp, ptr[reg_ptr_global]); - vpaddd(ymm_int, ymm_int, ymm_tmp); - vpslld(ymm_int, ymm_int, 23); - vmulps(ymm_dst, ymm_dst, ymm_int); - pop(reg_ptr_global); -} - void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { // y = 1 / (1 + e^-x) @@ -330,7 +155,7 @@ void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, vmaxps(ymm_src, ymm_src, ymm_tmp); vxorps(ymm_tmp, ymm_tmp, ymm_tmp); vsubps(ymm_src, ymm_tmp, ymm_src); - exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); + exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(ymm_dst, ymm_dst, ymm_tmp); vdivps(ymm_dst, ymm_tmp, ymm_dst); @@ -349,7 +174,7 @@ void VActJitCode::tanh_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, vxorps(ymm_zero, ymm_zero, ymm_zero); vsubps(ymm_tmp, ymm_zero, ymm_tmp); vmulps(ymm_src, ymm_src, ymm_tmp); - exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); + exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(ymm_dst, ymm_dst, ymm_tmp); vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); @@ -373,7 +198,7 @@ void VActJitCode::generate() { relu_jmm(ymm_dst, ymm_src, ymm_zero); break; case operand_type::exp: - exp_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + exp_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::sigmoid: sigmoid_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); @@ -409,7 +234,7 @@ void VActJitCode::generate() { relu_jmm(xmm_dst, xmm_src, xmm_zero); break; case operand_type::exp: - exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + exp_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); break; default: break; diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 6adeebca7c..534398f4a4 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -16,6 +16,8 @@ limitations under the License. 
*/
 #include
 #include "paddle/fluid/operators/math/jit_gen.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -40,6 +42,51 @@ typedef enum {
 identity
 } operand_type;
 
+extern const float exp_float_consts[];
+extern const int exp_int_0x7f[];
+extern int g_tmp_mem[];
+
+// TODO(TJ): move these to some proper place
+#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MAX 13.0
+#define EXP_MAX_INPUT 40.0
+#define XMM_FLOAT_BLOCK 4
+#define YMM_FLOAT_BLOCK 8
+#define ZMM_FLOAT_BLOCK 16
+
+#define ALIGN32 __attribute__((aligned(32)))
+#define EXP_HIG 88.3762626647949f
+#define EXP_LOW -88.3762626647949f
+#define CEPHES_LOG2EF 1.44269504088896341
+#define CEPHES_EXP_C1 0.693359375
+#define CEPHES_EXP_C2 -2.12194440e-4
+#define CEPHES_EXP_P0 1.9875691500E-4
+#define CEPHES_EXP_P1 1.3981999507E-3
+#define CEPHES_EXP_P2 8.3334519073E-3
+#define CEPHES_EXP_P3 4.1665795894E-2
+#define CEPHES_EXP_P4 1.6666665459E-1
+#define CEPHES_EXP_P5 5.0000001201E-1
+
+#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val
+
+#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
+
 // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
 class VXXJitCode : public JitCode {
  public:
@@ -134,10 +181,87 @@ class VActJitCode : public JitCode {
   }
 
   // compute exp with ymm, xmm
-  void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2,
-               int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5);
-  void exp_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, int fx_idx = 2,
-               int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5);
+  template
+  void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3,  // NOLINT
+               int mask_idx = 4, int tmp_idx = 5) {
+    using namespace platform::jit;  // NOLINT
+    assert(src.getIdx() != dst.getIdx());  // TODO(TJ): use enforce
+    // check all idx can not equal
+    JMM jmm_fx = JMM(fx_idx);
+    JMM jmm_fy = JMM(fy_idx);
+    JMM jmm_mask = JMM(mask_idx);
+    JMM jmm_tmp = JMM(tmp_idx);
+    reg64_t reg_ptr_global = rax;
+    push(reg_ptr_global);
+    mov(reg_ptr_global, reinterpret_cast(exp_float_consts));
+    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
+    vminps(src, src, jmm_tmp);
+    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
+    vmaxps(src, src, jmm_tmp);
+    // express exp(x) as exp(g + n*log(2))
+    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
+    vmulps(jmm_fx, src, jmm_tmp);
+    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
+    vaddps(jmm_fx, jmm_fx, jmm_tmp);
+    vroundps(jmm_fy, jmm_fx, 0x01);
+    // if
greater, substract 1 + vcmpgtps(jmm_mask, jmm_fy, jmm_fx); + vmovaps(jmm_tmp, ptr[reg_ptr_global]); + vandps(jmm_mask, jmm_mask, jmm_tmp); + vsubps(jmm_fx, jmm_fy, jmm_mask); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); + vmulps(jmm_fy, jmm_fx, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); + JMM ymm_z = JMM(jmm_mask.getIdx()); + vmulps(ymm_z, jmm_fx, jmm_tmp); + vsubps(src, src, jmm_fy); + vsubps(src, src, ymm_z); + vmulps(ymm_z, src, src); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); + vmulps(dst, src, jmm_tmp); + for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; + i += (YMM_FLOAT_BLOCK * sizeof(float))) { + vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4 + vaddps(dst, dst, jmm_tmp); + vmulps(dst, dst, src); + } + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); + vaddps(dst, dst, jmm_tmp); + vmulps(dst, dst, ymm_z); + vaddps(dst, dst, src); + vmovaps(jmm_tmp, ptr[reg_ptr_global]); + vaddps(dst, dst, jmm_tmp); + // build 2^n + JMM ymm_int = jmm_fx; + vcvttps2dq(ymm_int, jmm_fx); + mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); + vmovdqa(jmm_tmp, ptr[reg_ptr_global]); + if (MayIUse(avx2) || std::is_same::value) { + vpaddd(ymm_int, ymm_int, jmm_tmp); + vpslld(ymm_int, ymm_int, 23); + } else if (MayIUse(avx)) { + xmm_t xtmp1 = xmm_t(ymm_int.getIdx()); + xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx()); + reg64_t reg_ptr_tmp = reg_ptr_global; + mov(reg_ptr_tmp, reinterpret_cast(g_tmp_mem)); + vmovdqa(ptr[reg_ptr_tmp], ymm_int); + vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], jmm_tmp); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp], xtmp1); + // next 128bits + vmovdqa(xtmp1, ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)]); + vmovdqa(xtmp2, ptr[reg_ptr_tmp + + (YMM_FLOAT_BLOCK + XMM_FLOAT_BLOCK) * sizeof(float)]); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)], xtmp1); + // load out + vmovdqa(ymm_int, ptr[reg_ptr_tmp]); + } + vmulps(dst, dst, ymm_int); + pop(reg_ptr_global); + } // compute sigmoid with ymm void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 4d8d3cd79a..117baaee2b 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -26,6 +26,7 @@ namespace operators { namespace math { namespace jitkernel { +// TODO(TJ): move these to some proper place #define SIGMOID_THRESHOLD_MIN -40.0 #define SIGMOID_THRESHOLD_MAX 13.0 #define EXP_MAX_INPUT 40.0 From 4dbdfa60ef6d13568880fb2de5ee31a469080ab7 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 17:29:36 +0000 Subject: [PATCH 28/80] sigmoid and tanh support all size test=develop --- paddle/fluid/operators/math/jit_code.cc | 67 ++++--------------------- paddle/fluid/operators/math/jit_code.h | 50 +++++++++++++++--- 2 files changed, 54 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index fd18256b0c..a080079a2d 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -132,56 +132,8 @@ const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; int g_tmp_mem[16] ALIGN32 = {0}; bool VActJitCode::init(int d, operand_type type) { - bool ok = MayIUse(avx); - if (type == operand_type::relu || type == operand_type::exp) { - // TODO(TJ): implement avx512, avx_exp is 
slower than mkl when d >= 256 - return ok; - } else { - // TODO(TJ): support more - return ok && d % 8 == 0; - } -} - -void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - // y = 1 / (1 + e^-x) - ymm_t ymm_tmp = ymm_t(tmp_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); - vminps(ymm_src, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); - vmaxps(ymm_src, ymm_src, ymm_tmp); - vxorps(ymm_tmp, ymm_tmp, ymm_tmp); - vsubps(ymm_src, ymm_tmp, ymm_src); - exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vdivps(ymm_dst, ymm_tmp, ymm_dst); - pop(reg_ptr_global); -} - -void VActJitCode::tanh_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - // y = 2 / (1 + e^(-2x)) - 1 - ymm_t ymm_tmp = ymm_t(tmp_idx); - ymm_t ymm_zero = ymm_t(mask_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); - vxorps(ymm_zero, ymm_zero, ymm_zero); - vsubps(ymm_tmp, ymm_zero, ymm_tmp); - vmulps(ymm_src, ymm_src, ymm_tmp); - exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); - vdivps(ymm_dst, ymm_tmp, ymm_dst); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vsubps(ymm_dst, ymm_dst, ymm_tmp); - pop(reg_ptr_global); + // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256 + return MayIUse(avx); } void VActJitCode::generate() { @@ -201,10 +153,10 @@ void VActJitCode::generate() { exp_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::sigmoid: - sigmoid_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + sigmoid_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::tanh: - tanh_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + tanh_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::identity: break; @@ -214,11 +166,6 @@ void VActJitCode::generate() { vmovups(ptr[param2 + offset], ymm_dst); offset += sizeof(float) * YMM_FLOAT_BLOCK; } - if (type_ != operand_type::relu && type_ != operand_type::exp) { - // TODO(TJ): remove me - ret(); - return; - } int rest = num_ % YMM_FLOAT_BLOCK; int block = XMM_FLOAT_BLOCK; while (rest > 0) { @@ -236,6 +183,12 @@ void VActJitCode::generate() { case operand_type::exp: exp_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); break; + case operand_type::sigmoid: + sigmoid_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + case operand_type::tanh: + tanh_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; default: break; } diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 534398f4a4..65f83ff484 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -263,13 +263,51 @@ class VActJitCode : public JitCode { pop(reg_ptr_global); } - // compute sigmoid with ymm - void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + // compute sigmoid with ymm, xmm + template + void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2, // NOLINT + int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) 
{ + // y = 1 / (1 + e^-x) + JMM jmm_tmp = JMM(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); + vminps(src, src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); + vmaxps(src, src, jmm_tmp); + vxorps(jmm_tmp, jmm_tmp, jmm_tmp); + vsubps(src, jmm_tmp, src); + exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(dst, dst, jmm_tmp); + vdivps(dst, jmm_tmp, dst); + pop(reg_ptr_global); + } - // compute tanh with ymm - void tanh_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + // compute tanh with ymm, xmm + template + void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT + int mask_idx = 4, int tmp_idx = 5) { + // y = 2 / (1 + e^(-2x)) - 1 + JMM jmm_tmp = JMM(tmp_idx); + JMM jmm_zero = JMM(mask_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); + vxorps(jmm_zero, jmm_zero, jmm_zero); + vsubps(jmm_tmp, jmm_zero, jmm_tmp); + vmulps(src, src, jmm_tmp); + exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(dst, dst, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); + vdivps(dst, jmm_tmp, dst); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vsubps(dst, dst, jmm_tmp); + pop(reg_ptr_global); + } protected: int num_; From be80bb4f28f4a50cfbc96edd790227f59273d20e Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Fri, 16 Nov 2018 20:01:56 +0100 Subject: [PATCH 29/80] - Fix to GPU test=develop --- paddle/fluid/operators/softmax_op.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 91829d5761..8eb5c7691e 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -36,7 +36,9 @@ class SoftmaxKernel : public framework::OpKernel { Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); #ifdef PADDLE_ON_INFERENCE - math::SoftmaxFunctor()( + math::SoftmaxFunctor< + DeviceContext, T, + std::is_same::value>()( context.template device_context(), &X_2d, &Out_2d); #else math::SoftmaxFunctor()( From a19b3225a1da8c31fc996bace3ac09e6f5f177ef Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Sat, 17 Nov 2018 14:56:43 +0000 Subject: [PATCH 30/80] fix jitcode small size test=develop --- paddle/fluid/operators/math/jit_code.cc | 12 ++++++++---- paddle/fluid/operators/math/jit_kernel_test.cc | 10 +++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index a080079a2d..e484e9a3c7 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -59,9 +59,10 @@ void VXXJitCode::generate() { offset += sizeof(float) * YMM_FLOAT_BLOCK; } int rest = num_ % YMM_FLOAT_BLOCK; - int block = XMM_FLOAT_BLOCK; while (rest > 0) { + int block = XMM_FLOAT_BLOCK; if (rest >= 4) { + block = 4; if (scalar_index_ != 1) { vmovups(xmm_src1, ptr[param1 + offset]); } @@ -69,6 +70,7 @@ void VXXJitCode::generate() { vmovups(xmm_src2, ptr[param2 + offset]); } } else if (rest >= 2) { + block = 2; if (scalar_index_ != 1) { vmovq(xmm_src1, 
ptr[param1 + offset]);
       }
@@ -76,6 +78,7 @@ void VXXJitCode::generate() {
         vmovq(xmm_src2, ptr[param2 + offset]);
       }
     } else {
+      block = 1;
       if (scalar_index_ != 1) {
         vmovss(xmm_src1, ptr[param1 + offset]);
       }
@@ -105,7 +108,6 @@ void VXXJitCode::generate() {
     }
     offset += sizeof(float) * block;
     rest -= block;
-    block /= 2;
   }
   ret();
 }
@@ -167,13 +169,16 @@ void VActJitCode::generate() {
     offset += sizeof(float) * YMM_FLOAT_BLOCK;
   }
   int rest = num_ % YMM_FLOAT_BLOCK;
-  int block = XMM_FLOAT_BLOCK;
   while (rest > 0) {
+    int block = XMM_FLOAT_BLOCK;
     if (rest >= 4) {
+      block = 4;
       vmovups(xmm_src, ptr[param1 + offset]);
     } else if (rest >= 2) {
+      block = 2;
       vmovq(xmm_src, ptr[param1 + offset]);
     } else {
+      block = 1;
       vmovss(xmm_src, ptr[param1 + offset]);
     }
     switch (type_) {
@@ -201,7 +206,6 @@ void VActJitCode::generate() {
     }
     offset += sizeof(float) * block;
     rest -= block;
-    block /= 2;
   }
   ret();
 }
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index 932fa4c000..b6c62a2634 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -69,7 +69,7 @@ void vrelu_intri8(const int n, const float* x, float* y) {
 
 TEST(JitKernel, vrelu) {
   namespace jit = paddle::operators::math::jitkernel;
-  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
+  for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) {
     std::vector x(d);
     std::vector zref(d), ztgt(d);
     RandomVec(d, x.data(), -10.f, 1.f);
@@ -159,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) {
 
 TEST(JitKernel, vexp) {
   namespace jit = paddle::operators::math::jitkernel;
-  for (int d : {7, 8, 12, 15, 16, 20, 30, 128, 256}) {
+  for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) {
     std::vector x(d);
     std::vector zref(d), ztgt(d);
     RandomVec(d, x.data(), -2.f, 2.f);
@@ -234,7 +234,7 @@ void vsigmoid_better(
 
 TEST(JitKernel, vsigmoid) {
   namespace jit = paddle::operators::math::jitkernel;
-  for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
+  for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
     std::vector x(d);
     std::vector zref(d), ztgt(d);
     RandomVec(d, x.data(), -2.f, 2.f);
@@ -298,7 +298,7 @@ void vtanh_better(
 
 TEST(JitKernel, vtanh) {
   namespace jit = paddle::operators::math::jitkernel;
-  for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
+  for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
     std::vector x(d);
     std::vector zref(d), ztgt(d);
     RandomVec(d, x.data(), -2.f, 2.f);
@@ -389,7 +389,7 @@ void lstm_ctht_better(
 
 TEST(JitKernel, lstm) {
   namespace jit = paddle::operators::math::jitkernel;
-  for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) {
+  for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) {
     int d4 = d * 4;
     int d3 = d * 3;
     std::vector x(d4), xref(d4);

From 9b0eae3023e3faf6a40a69f5ff79bcc2303c674b Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Sun, 18 Nov 2018 13:27:17 +0100
Subject: [PATCH 31/80] - Removing partial specialization of softmax for
 inference for GPU

test=develop
---
 paddle/fluid/operators/math/softmax.h      |  3 ++-
 paddle/fluid/operators/math/softmax_impl.h | 10 +++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h
index bf698dc2f7..089458e957 100644
--- a/paddle/fluid/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
@@ -19,7 +19,8 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template
+template
 class SoftmaxFunctor {
  public:
   void
operator()(const DeviceContext& context, const framework::Tensor* X, diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index e09a243347..0f3e5b2008 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -33,8 +33,8 @@ struct ValueClip { } }; -template -void SoftmaxFunctor::operator()( +template +void SoftmaxFunctor::operator()( const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); @@ -66,8 +66,12 @@ void SoftmaxFunctor::operator()( .broadcast(one_by_class)); } +template +using enable_if_CPU = typename std::enable_if< + std::is_same::value>::type; + template -class SoftmaxFunctor { +class SoftmaxFunctor> { void operator()(const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { auto in_dims = X->dims(); From 7486b0ddeccb24bec86d3e16a6cf0d86e9fb71c1 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Nov 2018 12:54:37 +0800 Subject: [PATCH 32/80] fix(Mac): fix unittest of macos test=develop --- .gitignore | 1 + paddle/fluid/pybind/pybind.cc | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index fa0c888260..4f3a304658 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +python/paddle/fluid/tests/unittests/reader_reset_test.recordio paddle/operators/check_t.save paddle/operators/check_tensor.ls paddle/operators/tensor.save diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0d059d8aea..d85480e604 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -357,6 +357,9 @@ All parameter, weight, gradient are variables in Paddle. return self.GetMutable(); }, py::return_value_policy::reference) + +#endif +#ifndef _WIN32 .def("get_reader", [](Variable &self) -> framework::ReaderHolder * { PADDLE_ENFORCE(self.IsType()); @@ -364,7 +367,7 @@ All parameter, weight, gradient are variables in Paddle. 
}, py::return_value_policy::reference) #endif - ; + ; // NOLINT #if !defined(_WIN32) py::class_(m, "Reader", "") From d36491c28a74e8961fad6f31b64cf5a157114218 Mon Sep 17 00:00:00 2001 From: Wojciech Uss Date: Mon, 19 Nov 2018 05:59:27 +0100 Subject: [PATCH 33/80] add allocator.h copy The allocator.h header file is required for C-API inference applications test=develop --- cmake/inference_lib.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 729bdcb3dc..7355b67ab1 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -166,8 +166,8 @@ copy(framework_lib DEPS ${framework_lib_deps} set(module "memory") copy(memory_lib - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation ) set(inference_deps paddle_fluid_shared paddle_fluid) From e878a8e885ecc6be6b151dbf2f26fadf01abe6da Mon Sep 17 00:00:00 2001 From: Superjomn Date: Mon, 19 Nov 2018 07:19:13 +0000 Subject: [PATCH 34/80] update test=develop --- paddle/fluid/inference/analysis/analyzer_tester.cc | 2 ++ .../fluid/inference/analysis/passes/ir_graph_build_pass.h | 6 +++--- paddle/fluid/inference/api/CMakeLists.txt | 7 +++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 48fc5dda2a..84a0c3374c 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -30,6 +30,7 @@ TEST(Analyzer, analysis_without_tensorrt) { Argument argument; argument.SetModelDir(FLAGS_inference_model_dir); argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + argument.SetUseGPU(false); Analyzer analyser; analyser.Run(&argument); @@ -41,6 +42,7 @@ TEST(Analyzer, analysis_with_tensorrt) { argument.SetTensorRtWorkspaceSize(1 << 20); argument.SetModelDir(FLAGS_inference_model_dir); argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + argument.SetUseGPU(false); Analyzer analyser; analyser.Run(&argument); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h index b0a0b8b75e..271e64fce5 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -17,6 +17,7 @@ #include #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace inference { @@ -34,11 +35,10 @@ class IrGraphBuildPass : public AnalysisPass { private: std::unique_ptr LoadModel( const std::string &path, framework::Scope *scope, - const boost::variant &place); + const platform::Place &place); std::unique_ptr LoadModel( const std::string &program_path, const std::string ¶ms_path, - framework::Scope *scope, - const boost::variant &place); + framework::Scope *scope, const platform::Place &place); std::string model_binary_str_; }; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 82f74a269a..2dc426033b 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -27,11 +27,10 @@ endif() 
cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope) cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) -cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) -cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) - +cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce) +cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor) cc_test(test_paddle_inference_api SRCS api_tester.cc From a5249385a354df7dd8b28765f2dd7a7e12d679af Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 19 Nov 2018 16:26:58 +0800 Subject: [PATCH 35/80] Fix ssl and yum install problem test=develop --- tools/manylinux1/build_scripts/build.sh | 11 +++++------ tools/manylinux1/build_scripts/build_utils.sh | 9 ++++----- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh index ace0bebd9d..6c551eceb4 100644 --- a/tools/manylinux1/build_scripts/build.sh +++ b/tools/manylinux1/build_scripts/build.sh @@ -13,8 +13,8 @@ CPYTHON_VERSIONS="3.7.0 3.6.0 3.5.1 2.7.11" # openssl version to build, with expected sha256 hash of .tar.gz # archive -OPENSSL_ROOT=openssl-1.0.2l -OPENSSL_HASH=ce07195b659e75f4e1db43552860070061f156a98bb37b672b101ba6e3ddf30c +OPENSSL_ROOT=openssl-1.1.0i +OPENSSL_HASH=ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99 EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb @@ -61,7 +61,7 @@ yum -y install bzip2 make git patch unzip bison yasm diffutils \ wget -q https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz && tar xzf cmake-3.5.2.tar.gz && \ cd cmake-3.5.2 && ./bootstrap && \ -make -j4 && make install && cd .. && rm cmake-3.5.2.tar.gz +make -j8 && make install && cd .. 
&& rm cmake-3.5.2.tar.gz # Install newest autoconf @@ -121,9 +121,8 @@ ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel # final image yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \ avahi freetype bitstream-vera-fonts \ - ${PYTHON_COMPILE_DEPS} > /dev/null 2>&1 -yum -y install ${MANYLINUX1_DEPS} -yum -y clean all > /dev/null 2>&1 + ${PYTHON_COMPILE_DEPS} > /dev/null 2>&1 || true +yum -y install ${MANYLINUX1_DEPS} && yum -y clean all > /dev/null 2>&1 || true yum list installed # we don't need libpython*.a, and they're many megabytes find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh index 942ca2b0f1..c1647ce244 100755 --- a/tools/manylinux1/build_scripts/build_utils.sh +++ b/tools/manylinux1/build_scripts/build_utils.sh @@ -52,11 +52,13 @@ function do_cpython_build { # NOTE --enable-shared for generating libpython shared library needed for # linking of some of the nupic.core test executables. - CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null - make -j8 > /dev/null if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then + CFLAGS="-Wformat" ./configure --prefix=${prefix} --with-openssl=/usr/local/ssl --enable-shared $unicode_flags > /dev/null + make -j8 > /dev/null make altinstall > /dev/null else + CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null + make -j8 > /dev/null make install > /dev/null fi popd @@ -68,9 +70,6 @@ function do_cpython_build { if [ -e ${prefix}/bin/python3 ]; then ln -s python3 ${prefix}/bin/python fi - if [ -e ${prefix}/bin/python3.6 ]; then - ln -s python3.6 ${prefix}/bin/python - fi if [ -e ${prefix}/bin/python3.7 ]; then ln -s python3.7 ${prefix}/bin/python fi From a8c077df7c1cfbe9d902c3a917acb631eaae5e9b Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Mon, 19 Nov 2018 13:10:05 +0000 Subject: [PATCH 36/80] Implement leaky relu tensorRT converter --- .../passes/ir_analysis_compose_pass.cc | 3 +- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 4 +- .../tensorrt/convert/leaky_relu_op.cc | 93 +++++++++++++++++++ .../tensorrt/convert/test_leaky_relu_op.cc | 48 ++++++++++ .../inference/tensorrt/plugin/CMakeLists.txt | 2 +- 6 files changed, 148 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index 38e9b1c5e7..3e89ad0792 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -45,7 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { std::unordered_set teller_set( {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "dropout", "split", "prelu", "conv2d_transpose"}); + "elementwise_add", "dropout", "split", "prelu", "conv2d_transpose", + "leaky_relu"}); if (!node->IsOp()) return false; if (teller_set.count(node->Op()->Type())) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d19505877b..ee1d1d839c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc 
+++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -551,4 +551,5 @@ USE_TRT_CONVERTER(pad); USE_TRT_CONVERTER(split); USE_TRT_CONVERTER(prelu); USE_TRT_CONVERTER(conv2d_transpose); +USE_TRT_CONVERTER(leaky_relu); #endif diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 85ad5ffe78..c0d6affae7 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -2,7 +2,7 @@ nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc -pad_op.cc split_op.cc prelu_op.cc +pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -37,3 +37,5 @@ nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin prelu_op SERIAL) +nv_test(test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc new file mode 100644 index 0000000000..810295e191 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// LeakyRelu converter from fluid to tensorRT +class LeakyReluOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid leaky_relu op to tensorrt layer"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + int input_num = op_desc.Input("X").size(); + PADDLE_ENFORCE(input_num == 1); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + // Get output + size_t output_num = op_desc.Output("Out").size(); + PADDLE_ENFORCE(output_num == 1); + // Get attrs + float alpha = boost::get(op_desc.GetAttr("alpha")); + + platform::CPUPlace place; + std::unique_ptr alpha_tensor( + new framework::LoDTensor()); + alpha_tensor->Resize(framework::make_ddim({2})); + float* alpha_data = alpha_tensor->mutable_data(place); + alpha_data[0] = alpha; + alpha_data[1] = 1.f - alpha; + // the leaky relu formula y = (x > 0) ? x : alpha * x is equal to + // y = alpha * x + (x > 0) ? 
(1 - alpha) * x : 0
+    TensorRTEngine::Weight scale{nvinfer1::DataType::kFLOAT, &alpha_data[0], 1};
+    TensorRTEngine::Weight shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    TensorRTEngine::Weight power{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    // y_scale = alpha * x
+    auto* scale_layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, shift.get(),
+        scale.get(), power.get());
+    PADDLE_ENFORCE(nullptr != scale_layer);
+    // y_relu = (x > 0) ? x : 0
+    auto* relu_layer = TRT_ENGINE_ADD_LAYER(engine_, Activation, *input,
+                                            nvinfer1::ActivationType::kRELU);
+    PADDLE_ENFORCE(nullptr != relu_layer);
+    // y_scale_relu = (1 - alpha) * y_relu
+    TensorRTEngine::Weight sub_scale{nvinfer1::DataType::kFLOAT, &alpha_data[1],
+                                     1};
+    auto* scale_relu_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Scale, *(relu_layer->getOutput(0)),
+                             nvinfer1::ScaleMode::kUNIFORM, shift.get(),
+                             sub_scale.get(), power.get());
+    PADDLE_ENFORCE(nullptr != scale_relu_layer);
+    auto* output_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *(scale_layer->getOutput(0)),
+                             *(scale_relu_layer->getOutput(0)),
+                             nvinfer1::ElementWiseOperation::kSUM);
+    PADDLE_ENFORCE(nullptr != output_layer);
+    // keep alpha tensor to avoid release it's memory
+    engine_->weight_map[op_desc.Input("alpha")[0]] = std::move(alpha_tensor);
+
+    std::string layer_name = "leaky_relu (Output: ";
+    auto output_name = op_desc.Output("Out")[0];
+    output_layer->getOutput(0)->setName(output_name.c_str());
+    engine_->SetITensor(output_name, output_layer->getOutput(0));
+    layer_name += output_name;
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+    output_layer->setName((layer_name + ")").c_str());
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(leaky_relu, LeakyReluOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
new file mode 100644
index 0000000000..6fcf78abe4
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
@@ -0,0 +1,48 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(leaky_relu_op, test_channel_wise) { + std::unordered_set parameters({"leaky_relu_alpha"}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("leaky_relu_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclOutputVar("leaky_relu_out", nvinfer1::DimsCHW(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("leaky_relu"); + desc.SetInput("X", {"leaky_relu_input"}); + desc.SetOutput("Out", {"leaky_relu_out"}); + + desc.SetAttr("alpha", 0.1f); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +// USE_OP(leaky_relu); +USE_OP(leaky_relu); diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index b6811f9183..190310ac46 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1 +1 @@ -nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context) +nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce tensorrt_engine) From 1622cb99372d8c41eede080220315ac165feb870 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Mon, 19 Nov 2018 13:34:14 +0000 Subject: [PATCH 37/80] Fix alpha tensor key --- paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc | 5 ++++- .../fluid/inference/tensorrt/convert/test_leaky_relu_op.cc | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index 810295e191..b3244ef84d 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -72,7 +72,10 @@ class LeakyReluOpConverter : public OpConverter { nvinfer1::ElementWiseOperation::kSUM); PADDLE_ENFORCE(nullptr != output_layer); // keep alpha tensor to avoid release it's memory - engine_->weight_map[op_desc.Input("alpha")[0]] = std::move(alpha_tensor); + std::string alpha_name = op_desc.Output("Out")[0] + "_alpha"; + PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) == + engine_->weight_map.end()); + engine_->weight_map[alpha_name] = std::move(alpha_tensor); std::string layer_name = "leaky_relu (Output: "; auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc index 6fcf78abe4..d00826af07 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -20,8 +20,8 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(leaky_relu_op, test_channel_wise) { - std::unordered_set parameters({"leaky_relu_alpha"}); +TEST(leaky_relu_op, test_leaky_relu) { + std::unordered_set parameters; framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1000); validator.DeclInputVar("leaky_relu_input", nvinfer1::DimsCHW(3, 2, 2)); From 6a017d9abe10c2c46533a757ed9f2f9c05489c87 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 19 Nov 2018 21:54:35 +0800 Subject: [PATCH 
38/80] Remove numpy's requirements or python3.7 will not be supported test=develop --- python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/requirements.txt b/python/requirements.txt index 84cf440397..e56d0f811c 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,5 +1,5 @@ requests==2.9.2 -numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version +numpy>=1.12 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version protobuf==3.1 recordio>=0.1.0 matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib From a2fce6daf253d29ca47dd61a50fda63a92440943 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 19 Nov 2018 22:00:28 +0800 Subject: [PATCH 39/80] Polish code test=develop --- python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/requirements.txt b/python/requirements.txt index e56d0f811c..f41b2e13f0 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,5 +1,5 @@ requests==2.9.2 -numpy>=1.12 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version +numpy>=1.12 #TODO:change to ">=1.12" to support python3.7 protobuf==3.1 recordio>=0.1.0 matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib From be50670348a23b35172e2420baeb058321ab3e13 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Tue, 20 Nov 2018 08:24:00 +0800 Subject: [PATCH 40/80] Remove the remnant code (test=develop) --- paddle/fluid/operators/stack_op.h | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index f1692ae956..56a12852a9 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -72,25 +72,6 @@ class StackOpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -struct StackFunctor { - HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) - : x_(x), y_(y), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - y_[idx] = x_[which_x][x_index]; - } - - private: - VecXType x_; - T *y_; - int n_; - int post_; -}; - template struct StackGradFunctor { HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) @@ -110,14 +91,6 @@ struct StackGradFunctor { int post_; }; -template -static inline void StackFunctorForRange(const DeviceContext &ctx, - const VecXType &x, T *y, int total_num, - int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackFunctor(x, y, n, post)); -} - template static inline void StackGradFunctorForRange(const DeviceContext &ctx, const VecDxType &dx, const T *dy, From d91740acb1e49e4baaad02aeda379f27f6ec0f69 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Tue, 20 Nov 2018 08:25:48 +0800 Subject: [PATCH 41/80] Revert "Remove the remnant code (test=develop)" This reverts commit be50670348a23b35172e2420baeb058321ab3e13. 
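For context, the functor restored here is what the CUDA branch of StackKernel evaluates once per output element via ForRange; only the CPU branch can take the plain memcpy fast path. A minimal host-side sketch of the index math (illustrative only; pre, n and post are hypothetical sizes standing in for the products of the dims before and after the stack axis):

    #include <cstdio>
    #include <vector>

    // Stack n inputs of shape [pre, post] into y of shape [pre, n, post],
    // using the same per-element index math as StackFunctor.
    int main() {
      const int pre = 2, n = 3, post = 4;  // made-up sizes for the demo
      std::vector<std::vector<float>> x(n, std::vector<float>(pre * post));
      for (int j = 0; j < n; ++j)
        for (int k = 0; k < pre * post; ++k) x[j][k] = 100.f * j + k;

      std::vector<float> y(pre * n * post);
      for (int idx = 0; idx < pre * n * post; ++idx) {
        int i = idx / (n * post);             // position along the "pre" dims
        int which_x = idx / post - i * n;     // which of the n stacked inputs
        int x_index = i * post + idx % post;  // offset inside that input
        y[idx] = x[which_x][x_index];
      }
      // y[0..3] comes from input 0, y[4..7] from input 1, and so on.
      std::printf("%g %g\n", y[0], y[4]);  // prints: 0 100
      return 0;
    }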
--- paddle/fluid/operators/stack_op.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index 56a12852a9..f1692ae956 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -72,6 +72,25 @@ class StackOpMaker : public framework::OpProtoAndCheckerMaker { } }; +template +struct StackFunctor { + HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) + : x_(x), y_(y), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + y_[idx] = x_[which_x][x_index]; + } + + private: + VecXType x_; + T *y_; + int n_; + int post_; +}; + template struct StackGradFunctor { HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) @@ -91,6 +110,14 @@ struct StackGradFunctor { int post_; }; +template +static inline void StackFunctorForRange(const DeviceContext &ctx, + const VecXType &x, T *y, int total_num, + int n, int post) { + platform::ForRange for_range(ctx, total_num); + for_range(StackFunctor(x, y, n, post)); +} + template static inline void StackGradFunctorForRange(const DeviceContext &ctx, const VecDxType &dx, const T *dy, From a906a361be831b9b425a9f197036fef506020857 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Tue, 20 Nov 2018 08:30:27 +0800 Subject: [PATCH 42/80] Add the macro for NVCC (test=develop) --- paddle/fluid/operators/stack_op.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index f1692ae956..3d132e4397 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -149,11 +149,20 @@ class StackKernel : public framework::OpKernel { for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; #ifdef __NVCC__ + int total_num = pre * n * post; + auto &dev_ctx = ctx.template device_context(); + thrust::device_vector device_x_vec(x_datas); auto x_data_arr = device_x_vec.data().get(); + + StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); + + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); #else auto x_data_arr = x_datas.data(); -#endif + size_t x_offset = 0; size_t y_offset = 0; for (int i = 0; i < pre; i++) { @@ -164,10 +173,6 @@ class StackKernel : public framework::OpKernel { } x_offset += post; } -#ifdef __NVCC__ - // Wait() must be called because device_x_vec may be destructed before - // kernel ends - dev_ctx.Wait(); #endif } }; From a94a7355f0014337006ea8bb04bb2c30c955f7ea Mon Sep 17 00:00:00 2001 From: chengduo Date: Tue, 20 Nov 2018 10:01:51 +0800 Subject: [PATCH 43/80] Refine the GraphNum check (#14144) * refine GraphCheck test=develop * fix ci fail test=develop --- paddle/fluid/framework/ir/graph_helper.cc | 28 +++++++++++++++------ paddle/fluid/framework/parallel_executor.cc | 13 ++++++++-- python/paddle/fluid/__init__.py | 3 ++- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 98112c1ed3..963179192f 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -15,8 +15,15 @@ limitations under the License. 
*/
 
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include
 #include
+#include
+#include
+#include
 #include
 
+DEFINE_string(print_sub_graph_dir, "",
+              "FLAGS_print_sub_graph_dir is used "
+              "to print the nodes of sub_graphs.");
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) {
     graph_nodes.emplace_back(g_nodes);
   }
 
-  if (VLOG_IS_ON(100)) {
-    VLOG(100) << "graph_num: " << graph_nodes.size();
-    for (auto &g_n : graph_nodes) {
-      VLOG(100) << "graph_nodes: " << g_n.size();
-      if (g_n.size() < 10) {
-        std::stringstream out;
+  if (FLAGS_print_sub_graph_dir.size()) {
+    if (graph_nodes.size() > 1) {
+      std::stringstream out;
+      for (auto &g_n : graph_nodes) {
+        out << "graph_nodes: " << g_n.size() << "\n";
+      }
+      out << "\n\n";
+      for (auto &g_n : graph_nodes) {
+        out << "graph_nodes: " << g_n.size();
         for (auto &node : g_n) {
           out << "\nNode: " << node->Name() << " in [";
           for (auto &n : node->inputs) {
@@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) {
           }
           out << "]";
         }
-        VLOG(100) << out.str();
+        out << "\n\n\n";
       }
+      std::unique_ptr fout(
+          new std::ofstream(FLAGS_print_sub_graph_dir));
+      PADDLE_ENFORCE(fout->good());
+      *fout << out.str();
     }
   }
 
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 39b47415ff..2c6e337568 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor(
   }
   // If the loss_var_name is given, the number of graph should be only one.
   if (loss_var_name.size()) {
-    PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
-                      "The number of graph should be only one");
+    size_t graph_num = ir::GraphNum(*graph);
+    if (graph_num > 1) {
+      LOG(WARNING)
+          << "The number of graphs should be only one, "
+             "but the current graph has "
+          << ir::GraphNum(*graph)
+          << " sub_graphs. If you want to see the nodes of the "
+             "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
+             "to specify the output dir. NOTES: if you are not training, "
+             "please don't pass loss_var_name.";
+    }
   }
 
   if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index b991974928..f2f49f813a 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -116,7 +116,8 @@ def __bootstrap__():
         'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
         'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
         "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb',
-        'allocator_strategy', 'reader_queue_speed_test_mode'
+        'allocator_strategy', 'reader_queue_speed_test_mode',
+        'print_sub_graph_dir'
     ]
     if os.name != 'nt':
         read_env_flags.append('warpctc_dir')

From bb2b35c85ebe726fa6baa94f466f65a71b21394e Mon Sep 17 00:00:00 2001
From: dengkaipeng
Date: Mon, 19 Nov 2018 21:11:12 +0800
Subject: [PATCH 44/80] Add python example for resize_nearest.

test=develop
---
 python/paddle/fluid/layers/nn.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index af96f5de4f..91599b156d 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -5788,7 +5788,7 @@ def image_resize(input,
     Examples:
         ..
code-block:: python - out = fluid.layers.image_resize(input, out_shape=[12, 12]) + out = fluid.layers.image_resize(input, out_shape=[12, 12], resample="NEAREST") """ resample_methods = { 'BILINEAR': 'bilinear', @@ -5891,6 +5891,11 @@ def resize_bilinear(input, Returns: ${out_comment}. + + Examples: + .. code-block:: python + + out = fluid.layers.resize_bilinear(input, out_shape=[12, 12]) """ return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape) @@ -5937,6 +5942,11 @@ def resize_nearest(input, Returns: ${out_comment}. + + Examples: + .. code-block:: python + + out = fluid.layers.resize_nearest(input, out_shape=[12, 12]) """ return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape) From 8bc1c5d2abb260ab4c20e009ceacb8508b8ae59d Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Tue, 20 Nov 2018 11:10:38 +0800 Subject: [PATCH 45/80] Implement the Tensorrt plugin for elementwise op (#14487) * Initialize the elementwise plugin. * Implement the basic CUDA kernel of elementwise plugin. test=develop --- .../ir_passes/tensorrt_subgraph_pass.cc | 2 +- .../passes/ir_analysis_compose_pass.cc | 3 +- .../inference/tensorrt/convert/CMakeLists.txt | 13 +- .../tensorrt/convert/elementwise_op.cc | 70 ++++++--- .../inference/tensorrt/convert/op_converter.h | 2 +- .../inference/tensorrt/convert/prelu_op.cc | 2 +- .../inference/tensorrt/convert/split_op.cc | 2 +- .../tensorrt/convert/test_elementwise_op.cc | 78 +++++++--- .../inference/tensorrt/convert/test_mul_op.cc | 18 +-- .../inference/tensorrt/convert/ut_helper.h | 2 +- paddle/fluid/inference/tensorrt/engine.cc | 5 +- paddle/fluid/inference/tensorrt/engine.h | 4 +- .../inference/tensorrt/plugin/CMakeLists.txt | 4 +- .../tensorrt/plugin/elementwise_op_plugin.cu | 138 ++++++++++++++++++ .../tensorrt/plugin/elementwise_op_plugin.h | 87 +++++++++++ .../tensorrt/plugin/prelu_op_plugin.cu | 2 + .../tensorrt/plugin/prelu_op_plugin.h | 2 + .../inference/tensorrt/plugin/serialize.h | 32 +++- .../tensorrt/plugin/split_op_plugin.cu | 25 ++-- .../tensorrt/plugin/split_op_plugin.h | 73 +++++---- .../inference/tensorrt/plugin/trt_plugin.cc | 28 ++-- .../inference/tensorrt/plugin/trt_plugin.h | 72 ++++++--- .../fluid/inference/tests/api/tester_helper.h | 2 +- 23 files changed, 500 insertions(+), 166 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 21fd8d2df4..c6b7c05f78 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // it is either an OP's input or an OP's output. 
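// The loop below walks block_desc.Op(index) and subgraph_nodes[index] in lockstep, and the PADDLE_ENFORCE_EQ that follows asserts that each node's name matches the corresponding op's type, i.e. the two sequences really describe the same ops in the same order.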
auto &subgraph_nodes = *Agent(node).subgraph(); - for (size_t index = 0; index < block_desc.OpSize(); index++) { + for (size_t index = 0; index < block_desc.OpSize(); ++index) { framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); auto correspond_node = subgraph_nodes[index]; PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index 38e9b1c5e7..267737e95c 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -45,7 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { std::unordered_set teller_set( {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "dropout", "split", "prelu", "conv2d_transpose"}); + "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", + "conv2d_transpose"}); if (!node->IsOp()) return false; if (teller_set.count(node->Op()->Type())) { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 85ad5ffe78..8dd6e8453f 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,9 +1,9 @@ # Add TRT tests nv_library(tensorrt_converter - SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc -batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc -pad_op.cc split_op.cc prelu_op.cc - DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) + SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc + batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc + pad_op.cc split_op.cc prelu_op.cc + DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter) @@ -20,7 +20,8 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL) nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine elementwise_add_op SERIAL) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin + elementwise_add_op elementwise_mul_op SERIAL) nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL) nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc @@ -33,7 +34,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL) nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin - split_op concat_op SERIAL) + split_op concat_op SERIAL) nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin prelu_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc 
index 1af091fabd..6975086193 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -13,11 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" namespace paddle { namespace inference { namespace tensorrt { +static bool CheckDims(const nvinfer1::Dims& dims_x, + const nvinfer1::Dims& dims_y) { + if (dims_x.nbDims != dims_y.nbDims) { + return false; + } + for (int i = 0; i < dims_x.nbDims; i++) { + if (dims_x.d[i] != dims_y.d[i]) { + return false; + } + } + return true; +} + class ElementwiseWeightOpConverter : public OpConverter { public: ElementwiseWeightOpConverter() {} @@ -26,7 +40,7 @@ class ElementwiseWeightOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); - VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -106,10 +120,12 @@ class ElementwiseTensorOpConverter : public OpConverter { ElementwiseTensorOpConverter() {} void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { + auto op_pair = ops.find(op_type_); + PADDLE_ENFORCE(op_pair != ops.end(), "Wrong elementwise op type!"); + // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
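// (Passing nullptr for the unused second constructor argument is safe here: the converter only reads the op's inputs, outputs, and attributes below, nothing else.)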
framework::OpDesc op_desc(op, nullptr); - VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -120,29 +136,35 @@ class ElementwiseTensorOpConverter : public OpConverter { nvinfer1::Dims dims_x = X->getDimensions(); nvinfer1::Dims dims_y = Y->getDimensions(); - // The two input tensor should have the same dims - PADDLE_ENFORCE(dims_x.nbDims >= 3); - if (dims_x.nbDims == dims_y.nbDims) { - for (int i = 0; i < dims_x.nbDims; i++) { - if (dims_x.d[i] != dims_y.d[i]) - PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); - } - } else { - PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); - } + int axis = boost::get(op_desc.GetAttr("axis")); + auto output_name = op_desc.Output("Out")[0]; + if (CheckDims(dims_x, dims_y)) { + // The two input tensor should have the same dims + VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; - auto op_pair = ops.find(op_type_); - if (op_pair == ops.end()) { - PADDLE_THROW("Wrong elementwise op type!"); - } - nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( - engine_, ElementWise, *const_cast(X), - *const_cast(Y), op_pair->second); + nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *const_cast(X), + *const_cast(Y), op_pair->second); - auto output_name = op_desc.Output("Out")[0]; - layer->setName(("elementwise (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + } else { + VLOG(3) << "Convert a fluid elementwise op to TensorRT " + "ElementWisePluginLayer"; + + plugin::ElementWisePlugin* plugin = + new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis); + plugin->AddInput(X); + plugin->AddInput(Y); + nvinfer1::IPluginLayer* layer = engine_->AddPlugin( + const_cast(plugin->GetInputs().data()), 2, + reinterpret_cast(plugin)); + + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + } if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. 
engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index d309d94c56..d61d635ed7 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -61,7 +61,7 @@ class OpConverter { // TODO(xingzhaolong): all mul, sub, div // static std::unordered_set add_weight_op_set {"add", "mul", // "sub", "div"}; - static std::unordered_set add_weight_op_set{"add"}; + static std::unordered_set add_weight_op_set{"add", "mul"}; PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); int op_type_len = op_desc.Type().size(); std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 337885e6ba..dbdff85dde 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -54,7 +54,7 @@ class PReluOpConverter : public OpConverter { TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT, static_cast(alpha_data), alpha_tensor_device->numel()); - PReluPlugin* plugin = new PReluPlugin(alpha_rt, mode); + plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); // keep alpha tensor to avoid release it's memory diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 159854ab59..6620c76318 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -50,7 +50,7 @@ class SplitOpConverter : public OpConverter { PADDLE_ENFORCE(output_lengths.size() == output_num); // - SplitPlugin* plugin = new SplitPlugin(axis, output_lengths); + plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc index 7537d02a35..cc967464a5 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -20,13 +20,12 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(elementwise_op, add_weight_test) { +TEST(elementwise_op, add_weight) { std::unordered_set parameters({"elementwise_add-Y"}); framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1 << 15); validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1)); - // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); // Prepare Op description @@ -44,30 +43,65 @@ TEST(elementwise_op, add_weight_test) { validator.Execute(8); } -TEST(elementwise_op, add_tensor_test) { - std::unordered_set parameters; - framework::Scope scope; - TRTConvertValidation validator(8, parameters, scope, 1 << 15); - validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); - validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3)); - // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); - validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); - - // 
Prepare Op description - framework::OpDesc desc; - desc.SetType("elementwise_add"); - desc.SetInput("X", {"elementwise_add-X"}); - desc.SetInput("Y", {"elementwise_add-Y"}); - desc.SetOutput("Out", {"elementwise_add-Out"}); - - // the defalut axis of elementwise op is -1 - - validator.SetOp(*desc.Proto()); +TEST(elementwise_op, native) { + for (std::string type : {"add", "mul"}) { + int batch_size = 8; + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_" + type + "-X", + nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_" + type + "-Y", + nvinfer1::Dims3(10, 3, 3)); + validator.DeclOutputVar("elementwise_" + type + "-Out", + nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_" + type); + desc.SetInput("X", {"elementwise_" + type + "-X"}); + desc.SetInput("Y", {"elementwise_" + type + "-Y"}); + desc.SetOutput("Out", {"elementwise_" + type + "-Out"}); + + int axis = -1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + validator.Execute(batch_size); + } +} - validator.Execute(8); +TEST(elementwise_op, plugin) { + for (std::string type : {"add", "mul"}) { + int batch_size = 8; + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_" + type + "-X", + nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_" + type + "-Y", + nvinfer1::Dims3(10, 1, 1)); + validator.DeclOutputVar("elementwise_" + type + "-Out", + nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_" + type); + desc.SetInput("X", {"elementwise_" + type + "-X"}); + desc.SetInput("Y", {"elementwise_" + type + "-Y"}); + desc.SetOutput("Out", {"elementwise_" + type + "-Out"}); + + int axis = -1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + validator.Execute(batch_size); + } } } // namespace tensorrt } // namespace inference } // namespace paddle + USE_OP(elementwise_add); +USE_OP(elementwise_mul); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index 3d34cd7d5d..282f53559a 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 0a6f171fc4..f313beb73b 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 208bd12b83..f739752cbc 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -257,9 +257,10 @@ void TensorRTEngine::freshDeviceId() { } nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( - nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) { + nvinfer1::ITensor *const *inputs, int num_inputs, + plugin::PluginTensorRT *plugin) { owned_plugin_.emplace_back(plugin); - return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin); + return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 99420f19ba..f5b2c28ba9 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -128,7 +128,7 @@ class TensorRTEngine : public EngineBase { int GetRuntimeBatch(); int GetDevice() { return device_; } nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, - int nbInputs, PluginTensorRT*); + int num_inputs, plugin::PluginTensorRT*); // A pointer to CPU memory is needed of the TRT weight. // Before TRT runs, fluid loads weight into GPU storage. @@ -171,7 +171,7 @@ class TensorRTEngine : public EngineBase { // The specific GPU id that the TensorRTEngine bounded to. int device_; - std::vector> owned_plugin_; + std::vector> owned_plugin_; // TensorRT related internal members template diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index b6811f9183..4090269499 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1 +1,3 @@ -nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context) +nv_library(tensorrt_plugin + SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu + DEPS enforce device_context) diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu new file mode 100644 index 0000000000..9cd9026b73 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -0,0 +1,138 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +namespace details { + +template +struct Add { + __device__ T operator()(const T& a, const T& b) const { return a + b; } +}; + +template +struct Mul { + __device__ T operator()(const T& a, const T& b) const { return a * b; } +}; + +template +__global__ void ColumnWiseKernel(Operator op, const T* x, const T* y, T* out, + int batch_size, int num_rows, int num_cols) { + for (int batch_id = 0; batch_id < batch_size; ++batch_id) { + int row = blockIdx.x; + for (; row < num_rows; row += gridDim.x) { + T value_y = y[batch_id * num_rows + row]; + int col = threadIdx.x; + int offset = (batch_id * num_rows + row) * num_cols; + for (; col < num_cols; col += blockDim.x) { + T value_x = x[offset + col]; + out[offset + col] = op(value_x, value_y); + } + } + } +} + +template +static void ElementWise(Operator op, const T* x, const T* y, T* out, + int batch_size, int prev, int midd, int post, + cudaStream_t stream) { + const int kThreadsPerBlock = 1024; + const int kMaximumBlocks = 65535; + if (prev == 1) { + int num_threads = (post > kThreadsPerBlock) ? kThreadsPerBlock + : (((post + 31) >> 5) << 5); + int num_blocks = (midd < kMaximumBlocks) ? midd : kMaximumBlocks; + ColumnWiseKernel<<>>( + op, x, y, out, batch_size, midd, post); + } else if (post == 1) { + PADDLE_THROW("Not implemented."); + } else { + PADDLE_THROW("Not implemented."); + } +} + +} // namespace details + +nvinfer1::Dims ElementWisePlugin::getOutputDimensions( + int index, const nvinfer1::Dims* input_dims, int num_inputs) { + PADDLE_ENFORCE_EQ(index, 0); + PADDLE_ENFORCE_EQ(num_inputs, 2); + PADDLE_ENFORCE_NOT_NULL(input_dims); + return input_dims[0]; +} + +int ElementWisePlugin::initialize() { + PADDLE_ENFORCE_GT(dims_y_.nbDims, 0); + + axis_ = (axis_ == -1) ? 
dims_x_.nbDims - dims_y_.nbDims : axis_; + int trimed_nb_dims = dims_y_.nbDims; + for (; trimed_nb_dims > 0; --trimed_nb_dims) { + if (dims_y_.d[trimed_nb_dims - 1] != 1) { + break; + } + } + dims_y_.nbDims = trimed_nb_dims; + + PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_); + PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims); + + prev_size_ = 1; + midd_size_ = 1; + post_size_ = 1; + for (int i = 0; i < axis_; ++i) { + prev_size_ *= dims_x_.d[i]; + } + + for (int i = 0; i < dims_y_.nbDims; ++i) { + PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i], + "Broadcast dimension mismatch."); + midd_size_ *= dims_y_.d[i]; + } + + for (int i = axis_ + dims_y_.nbDims; i < dims_x_.nbDims; ++i) { + post_size_ *= dims_x_.d[i]; + } + return 0; +} + +int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const float* x = reinterpret_cast(inputs[0]); + const float* y = reinterpret_cast(inputs[1]); + float* out = reinterpret_cast(outputs[0]); + + if (type_ == nvinfer1::ElementWiseOperation::kSUM) { + details::ElementWise(details::Add(), x, y, out, batch_size, + prev_size_, midd_size_, post_size_, stream); + } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) { + details::ElementWise(details::Mul(), x, y, out, batch_size, + prev_size_, midd_size_, post_size_, stream); + } else { + PADDLE_THROW("Not implemented."); + } + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h new file mode 100644 index 0000000000..9c461f7a5c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class ElementWisePlugin : public PluginTensorRT { + public: + ElementWisePlugin(nvinfer1::ElementWiseOperation type, + nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y, + int axis) + : type_(type), + dims_x_(dims_x), + dims_y_(dims_y), + axis_(axis), + prev_size_(1), + midd_size_(1), + post_size_(1) {} + + ElementWisePlugin(void const *serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &dims_x_); + DeserializeValue(&serial_data, &serial_length, &dims_y_); + } + + ElementWisePlugin *clone() const override { + // return new ElementWisePlugin(dims_x_, dims_y_, axis_); + return nullptr; + } + + const char *getPluginType() const override { return "elementwise"; } + + nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims *input_dims, + int num_inputs) override; + + int initialize() override; + + // execute the layer + int enqueue(int batch_size, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream); + + protected: + size_t getSerializationSize() override { + return SerializedSize(axis_) + SerializedSize(dims_x_) + + SerializedSize(dims_y_) + getBaseSerializationSize(); + } + + void serialize(void *buffer) override { + serializeBase(buffer); + SerializeValue(&buffer, axis_); + SerializeValue(&buffer, dims_x_); + SerializeValue(&buffer, dims_y_); + } + + nvinfer1::ElementWiseOperation type_; + nvinfer1::Dims dims_x_; + nvinfer1::Dims dims_y_; + int axis_; + int prev_size_; + int midd_size_; + int post_size_; +}; + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index 0f1ca11295..e8f4254402 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -20,6 +20,7 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { static const int CUDA_NUM_THREADS = 1024; static const int CUDA_MAX_NUM_BLOCKS = 65535; @@ -126,6 +127,7 @@ int PReluPlugin::enqueue(int batchSize, const void *const *inputs, return cudaGetLastError() != cudaSuccess; } +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h index aa0f865c89..0db56a310b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h @@ -21,6 +21,7 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { class PReluPlugin : public PluginTensorRT { TensorRTEngine::Weight alpha_; @@ -63,6 +64,7 @@ class PReluPlugin : public PluginTensorRT { void *workspace, cudaStream_t stream) override; }; +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/serialize.h index 50c0b17d78..ce859f16fc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/serialize.h +++ b/paddle/fluid/inference/tensorrt/plugin/serialize.h 
@@ -14,10 +14,15 @@ #pragma once -#include #include #include #include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { template inline void SerializeValue(void** buffer, T const& value); @@ -26,7 +31,7 @@ template inline void DeserializeValue(void const** buffer, size_t* buffer_size, T* value); -namespace { +namespace details { template struct Serializer {}; @@ -36,10 +41,12 @@ struct Serializer::value || std::is_enum::value || std::is_pod::value>::type> { static size_t SerializedSize(T const& value) { return sizeof(T); } + static void Serialize(void** buffer, T const& value) { std::memcpy(*buffer, &value, sizeof(T)); reinterpret_cast(*buffer) += sizeof(T); } + static void Deserialize(void const** buffer, size_t* buffer_size, T* value) { assert(*buffer_size >= sizeof(T)); std::memcpy(value, *buffer, sizeof(T)); @@ -51,10 +58,12 @@ struct Serializer::value || template <> struct Serializer { static size_t SerializedSize(const char* value) { return strlen(value) + 1; } + static void Serialize(void** buffer, const char* value) { - std::strcpy(static_cast(*buffer), value); + std::strcpy(static_cast(*buffer), value); // NOLINT reinterpret_cast(*buffer) += strlen(value) + 1; } + static void Deserialize(void const** buffer, size_t* buffer_size, const char** value) { *value = static_cast(*buffer); @@ -73,39 +82,46 @@ struct Serializer, static size_t SerializedSize(std::vector const& value) { return sizeof(value.size()) + value.size() * sizeof(T); } + static void Serialize(void** buffer, std::vector const& value) { SerializeValue(buffer, value.size()); size_t nbyte = value.size() * sizeof(T); std::memcpy(*buffer, value.data(), nbyte); reinterpret_cast(*buffer) += nbyte; } + static void Deserialize(void const** buffer, size_t* buffer_size, std::vector* value) { size_t size; DeserializeValue(buffer, buffer_size, &size); value->resize(size); size_t nbyte = value->size() * sizeof(T); - assert(*buffer_size >= nbyte); + PADDLE_ENFORCE_GE(*buffer_size, nbyte); std::memcpy(value->data(), *buffer, nbyte); reinterpret_cast(*buffer) += nbyte; *buffer_size -= nbyte; } }; -} // namespace +} // namespace details template inline size_t SerializedSize(T const& value) { - return Serializer::SerializedSize(value); + return details::Serializer::SerializedSize(value); } template inline void SerializeValue(void** buffer, T const& value) { - return Serializer::Serialize(buffer, value); + return details::Serializer::Serialize(buffer, value); } template inline void DeserializeValue(void const** buffer, size_t* buffer_size, T* value) { - return Serializer::Deserialize(buffer, buffer_size, value); + return details::Serializer::Deserialize(buffer, buffer_size, value); } + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index bd6a44dcc1..4adea2db1e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,26 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
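// Illustrative sketch of the splitting rule implemented below (made-up sizes, not from the patch): an input of dims (6, H, W) split along axis_ = 0 with output_length_ = {2, 4} yields outputs of dims (2, H, W) and (4, H, W), since getOutputDimensions() only rewrites output_dims.d[axis_] per output index.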
-#include -#include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { -nvinfer1::Dims SplitPlugin::getOutputDimensions(int index, - const nvinfer1::Dims* inputDims, - int nbInputs) { - assert(nbInputs == 1); - assert(index < this->getNbOutputs()); - nvinfer1::Dims const& input_dims = inputDims[0]; - nvinfer1::Dims output_dims = input_dims; +nvinfer1::Dims SplitPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* input_dims, int num_inputs) { + PADDLE_ENFORCE_EQ(num_inputs, 1); + PADDLE_ENFORCE_LT(index, this->getNbOutputs()); + + nvinfer1::Dims output_dims = input_dims[0]; output_dims.d[axis_] = output_length_.at(index); return output_dims; } int SplitPlugin::initialize() { + PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS); + std::vector segment_offsets(1, 0); for (int i = 0; i < this->getNbOutputs(); ++i) { segment_offsets.push_back(segment_offsets.back() + output_length_[i]); @@ -76,6 +76,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, return cudaGetLastError() != cudaSuccess; } -} // tensorrt -} // inference -} // paddle +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 7281e40c33..b5b6e69992 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -14,61 +14,58 @@ #pragma once +#include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { class SplitPlugin : public PluginTensorRT { - int axis_; - std::vector output_length_; - int nx_, ny_, nz_; - std::vector segment_offsets_; + public: + SplitPlugin(int axis, std::vector const &output_lengths) + : axis_(axis), output_length_(output_lengths) {} + + SplitPlugin(void const *serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &output_length_); + } + + SplitPlugin *clone() const override { + return new SplitPlugin(axis_, output_length_); + } + + const char *getPluginType() const override { return "split"; } + int getNbOutputs() const override { return output_length_.size(); } + nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims *input_dims, + int num_inputs) override; + + int initialize() override; + int enqueue(int batchSize, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream) override; protected: - virtual size_t getSerializationSize() override { + size_t getSerializationSize() override { return SerializedSize(axis_) + SerializedSize(output_length_) + getBaseSerializationSize(); } - // TRT will call this func when we need to serialize the configuration of - // tensorrt. - // It should not be called by users. - virtual void serialize(void *buffer) override { + void serialize(void *buffer) override { serializeBase(buffer); SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); } - public: - SplitPlugin(int axis, std::vector const &output_lengths) - : axis_(axis), output_length_(output_lengths) { - assert(axis <= nvinfer1::Dims::MAX_DIMS); - } - - // It was used for tensorrt deserialization. - // It should not be called by users. 
- SplitPlugin(void const *serialData, size_t serialLength) { - deserializeBase(serialData, serialLength); - DeserializeValue(&serialData, &serialLength, &axis_); - DeserializeValue(&serialData, &serialLength, &output_length_); - } - - SplitPlugin *clone() const override { - return new SplitPlugin(axis_, output_length_); - } - - virtual const char *getPluginType() const override { return "split"; } - virtual int getNbOutputs() const override { return output_length_.size(); } - virtual nvinfer1::Dims getOutputDimensions(int index, - const nvinfer1::Dims *inputs, - int nbInputDims) override; - virtual int initialize() override; - virtual int enqueue(int batchSize, const void *const *inputs, void **outputs, - void *workspace, cudaStream_t stream) override; + int axis_; + std::vector output_length_; + int nx_, ny_, nz_; + std::vector segment_offsets_; }; -} // tensorrt -} // inference -} // paddle +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index 08016d84b1..b0f4cff3ac 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -17,6 +17,7 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { void PluginTensorRT::serializeBase(void*& buffer) { SerializeValue(&buffer, input_dims_); @@ -25,12 +26,12 @@ void PluginTensorRT::serializeBase(void*& buffer) { SerializeValue(&buffer, data_format_); } -void PluginTensorRT::deserializeBase(void const*& serialData, - size_t& serialLength) { - DeserializeValue(&serialData, &serialLength, &input_dims_); - DeserializeValue(&serialData, &serialLength, &max_batch_size_); - DeserializeValue(&serialData, &serialLength, &data_type_); - DeserializeValue(&serialData, &serialLength, &data_format_); +void PluginTensorRT::deserializeBase(void const*& serial_data, + size_t& serial_length) { + DeserializeValue(&serial_data, &serial_length, &input_dims_); + DeserializeValue(&serial_data, &serial_length, &max_batch_size_); + DeserializeValue(&serial_data, &serial_length, &data_type_); + DeserializeValue(&serial_data, &serial_length, &data_format_); } size_t PluginTensorRT::getBaseSerializationSize() { @@ -44,18 +45,17 @@ bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, (format == nvinfer1::PluginFormat::kNCHW)); } -void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims, - int nbInputs, - const nvinfer1::Dims* outputDims, - int nbOutputs, nvinfer1::DataType type, - nvinfer1::PluginFormat format, - int maxBatchSize) { +void PluginTensorRT::configureWithFormat( + const nvinfer1::Dims* input_dims, int num_inputs, + const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type, + nvinfer1::PluginFormat format, int max_batch_size) { data_type_ = type; data_format_ = format; - input_dims_.assign(inputDims, inputDims + nbInputs); - max_batch_size_ = maxBatchSize; + input_dims_.assign(input_dims, input_dims + num_inputs); + max_batch_size_ = max_batch_size; } +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 4d85e955a4..86084829e1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -14,23 +14,30 @@ #pragma once -#include +#include 
#include -#include #include #include -#include "NvInfer.h" #include "paddle/fluid/inference/tensorrt/plugin/serialize.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(profile); namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { class PluginTensorRT : public nvinfer1::IPluginExt { public: PluginTensorRT() {} + // It was used for TensorRT deserialization. + // It should not be called by users. PluginTensorRT(const void* serialized_data, size_t length) {} + virtual ~PluginTensorRT() {} + nvinfer1::Dims const& getInputDims(int index) const { return input_dims_.at(index); } @@ -38,43 +45,66 @@ class PluginTensorRT : public nvinfer1::IPluginExt { nvinfer1::DataType getDataType() const { return data_type_; } nvinfer1::PluginFormat getDataFormat() const { return data_format_; } virtual const char* getPluginVersion() const { return "1"; } + + void AddInput(nvinfer1::ITensor* input) { inputs_.push_back(input); } + std::vector& GetInputs() { return inputs_; } + + virtual nvinfer1::IPluginExt* clone() const = 0; + virtual const char* getPluginType() const = 0; + + // Following functions are inherit from nvinfer1::IPluginExt + // Get the number of outputs from the layer + int getNbOutputs() const { return 1; } + // Get the dimension of an output tensor + virtual nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims* input_dims, + int num_inputs) = 0; + // Find the workspace size required by the layer size_t getWorkspaceSize(int) const override { return 0; } + + // Initialize the layer for execution. + // This is called when the engine is created. + int initialize() override { return 0; } + // Shutdown the layer. This is called when the engine is destroyed void terminate() override {} - virtual ~PluginTensorRT() {} + // Execute the layer + virtual int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) = 0; + + // Find the size of the serialization buffer required + virtual size_t getSerializationSize() = 0; + // Serialize the layer config to buffer. + // TensorRT will call this func to serialize the configuration of TensorRT + // engine. It should not be called by users. + virtual void serialize(void* buffer) = 0; + // Check format support. The default is FLOAT32 and NCHW. bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const override; - void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, - const nvinfer1::Dims* outputDims, int nbOutputs, + // Configure the layer + void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs, + const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, - int maxBatchSize) override; - - // *NOTE* The following functions need to be overrided in the subclass. - virtual nvinfer1::IPluginExt* clone() const = 0; - virtual const char* getPluginType() const = 0; - // Initialize the layer for execution. This is called when the engine is - // created. - int initialize() override { return 0; } - // Serialize the layer config to buffer. 
- virtual void serialize(void* buffer) = 0; - virtual size_t getSerializationSize() = 0; - virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, - void* workspace, cudaStream_t stream) = 0; + int max_batch_size) override; protected: // Deserialize input_dims, max_batch_size, data_type, data_format - void deserializeBase(void const*& serialData, size_t& serialLength); + void deserializeBase(void const*& serial_data, // NOLINT + size_t& serial_length); // NOLINT size_t getBaseSerializationSize(); // Serialize input_dims, max_batch_size, data_type, data_format - void serializeBase(void*& buffer); + void serializeBase(void*& buffer); // NOLINT std::vector input_dims_; size_t max_batch_size_; nvinfer1::DataType data_type_; nvinfer1::PluginFormat data_format_; + + std::vector inputs_; }; +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index a404691413..e66ae28057 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -51,7 +51,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { LOG(INFO) << *reinterpret_cast(config); return; } - LOG(INFO) << *config; + LOG(INFO) << *reinterpret_cast(config); } void CompareResult(const std::vector &outputs, From 09ee266f8ebfb6b9e9011e41725d4cd94b141612 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Tue, 20 Nov 2018 11:38:45 +0800 Subject: [PATCH 46/80] disable two openblas tests temporarily test=develop --- paddle/fluid/inference/tests/api/CMakeLists.txt | 8 ++++---- python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 16a9b50e6f..cf2a61ea61 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -27,14 +27,14 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename endfunction() # RNN1 -if(NOT APPLE) +if(NOT APPLE AND WITH_MKLML) set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) else() - # TODO: fix this test on MACOS, the reason is that - # fusion_seqexpand_concat_fc_op is not supported on MACOS - message(WARNING "These tests has been disabled in OSX before being fixed: \n test_analyzer_rnn1") + # TODO: fix this test on MACOS and OPENBLAS, the reason is that + # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS + message(WARNING "These tests have been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_rnn1") endif() # RNN2 diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1513eca514..29e4ca04a7 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -45,6 +45,10 @@ if(APPLE) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass) endif() +if(NOT WITH_MKLML) + # this op is not supported on openblas + list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) +endif() function(py_test_modules TARGET_NAME) if(WITH_TESTING) From
01bda731165d40e1e7b562af8c4faa2d957366d8 Mon Sep 17 00:00:00 2001 From: Houjiang Chen Date: Tue, 20 Nov 2018 12:52:07 +0800 Subject: [PATCH 47/80] Update CMakeLists.txt --- paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 203101e708..9f69c8ef0d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,3 +1,3 @@ nv_library(tensorrt_plugin - SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu + SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu elementwise_op_plugin.cu DEPS enforce tensorrt_engine) From 33c65517fd92074cbf79a31d845385c8f9d686ac Mon Sep 17 00:00:00 2001 From: Houjiang Chen Date: Tue, 20 Nov 2018 12:52:26 +0800 Subject: [PATCH 48/80] Update CMakeLists.txt test=develop --- paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 9f69c8ef0d..a0329325be 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,3 +1,3 @@ nv_library(tensorrt_plugin - SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu elementwise_op_plugin.cu + SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu DEPS enforce tensorrt_engine) From b742d465203d9f57e5fe295230ff130550db2dfe Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 20 Nov 2018 06:03:56 +0000 Subject: [PATCH 49/80] fix demo ci bug on trt --- paddle/fluid/inference/api/analysis_predictor.cc | 2 ++ paddle/fluid/inference/api/paddle_pass_builder.h | 6 +++++- paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt | 2 +- paddle/fluid/inference/tests/api/trt_models_tester.cc | 2 -- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3a707907d9..814542cd0b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -551,4 +551,6 @@ USE_TRT_CONVERTER(pad); USE_TRT_CONVERTER(split); USE_TRT_CONVERTER(prelu); USE_TRT_CONVERTER(conv2d_transpose); + +USE_PASS(tensorrt_subgraph_pass); #endif diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 825bee833b..12e3a6f42e 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -116,8 +116,12 @@ class CpuPassStrategy : public PassStrategy { class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { + // TODO(NHZlX) Problem with data synchronization between GPU and CPU + // When running in GPU mode, the parameters are all on GPU. But the + // operations of "conv_bn_fuse_pass" are on CPU.
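+ // Hence only infer_clean_graph_pass is registered below; conv_bn_fuse_pass stays commented out until the placement issue is resolved.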
passes_.assign({ - "infer_clean_graph_pass", "conv_bn_fuse_pass", + "infer_clean_graph_pass", + // "infer_clean_graph_pass", "conv_bn_fuse_pass", }); } diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 6611e2e4b3..b6811f9183 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1 +1 @@ -nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce) +nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context) diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 922feba10f..ef612ce614 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -145,5 +145,3 @@ TEST(TensorRT_mobilenet, analysis) { } // namespace inference } // namespace paddle - -USE_PASS(tensorrt_subgraph_pass); From eb9b9becdcf1829a1feef2839410707847208eed Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Tue, 20 Nov 2018 16:57:02 +0800 Subject: [PATCH 50/80] add warm up in TestMultiThreadPrediction test=develop --- .../fluid/inference/tests/api/tester_helper.h | 37 ++++++++++++++----- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index e66ae28057..7b686045a5 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -222,19 +222,36 @@ void TestMultiThreadPrediction( // The inputs of each thread are all the same. std::vector outputs_tid; auto &predictor = predictors[tid]; - LOG(INFO) << "running thread " << tid; - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - for (const auto &input : inputs) { - ASSERT_TRUE(predictor->Run(input, &outputs_tid)); + + // warmup run + LOG(INFO) << "Running thread " << tid << ", warm up run..."; + { + Timer warmup_timer; + warmup_timer.tic(); + predictor->Run(inputs[0], outputs, batch_size); + PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); +#if !defined(_WIN32) + if (FLAGS_profile) { + paddle::platform::ResetProfiler(); } +#endif } - auto time = timer.toc(); - total_time += time; - PrintTime(batch_size, num_times, num_threads, tid, time / num_times, - inputs.size()); + LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; + { + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + for (const auto &input : inputs) { + ASSERT_TRUE(predictor->Run(input, &outputs_tid)); + } + } + + auto time = timer.toc(); + total_time += time; + PrintTime(batch_size, num_times, num_threads, tid, time / num_times, + inputs.size()); + } }); } for (int i = 0; i < num_threads; ++i) { From a8d3aaae2a648ee552d60869fc5117e61d4ce1b0 Mon Sep 17 00:00:00 2001 From: chengduo Date: Tue, 20 Nov 2018 17:30:02 +0800 Subject: [PATCH 51/80] print output log warning (#14497) test=develop --- paddle/fluid/platform/init.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 9f7aa55698..e07e9d3825 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -38,6 +38,7 @@ std::once_flag p2p_init_flag; void InitGflags(std::vector argv) { std::call_once(gflags_init_flag, [&]() { + FLAGS_logtostderr = true; 
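+ // With logtostderr enabled, glog routes LOG(INFO)/LOG(WARNING) output to stderr instead of per-severity log files, so warnings are visible on the console by default.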
argv.insert(argv.begin(), "dummy"); int argc = argv.size(); char **arr = new char *[argv.size()]; From faeb9b8aa9aff3a3a46be9c032b6ee50584b5b80 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 20 Nov 2018 09:46:06 +0000 Subject: [PATCH 52/80] fix compile rely problem --- paddle/fluid/inference/analysis/CMakeLists.txt | 7 ++++--- .../inference/analysis/ir_passes/CMakeLists.txt | 2 ++ paddle/fluid/inference/api/CMakeLists.txt | 2 +- paddle/fluid/inference/api/analysis_predictor.cc | 2 -- paddle/fluid/inference/tests/api/CMakeLists.txt | 16 ++++++++++------ 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index eb89fc5e11..0c73778b20 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -7,16 +7,17 @@ set(analysis_deps # analysis_deps can be extended accross the project add_subdirectory(ir_passes) add_subdirectory(passes) -cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES}) +cc_library(analysis_helper SRCS helper.cc DEPS framework_proto proto_desc graph paddle_fluid_api) + +cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES} analysis_helper) cc_library(argument SRCS argument.cc DEPS scope proto_desc) cc_library(analysis_pass SRCS analysis_pass.cc DEPS proto_desc) cc_library(analysis SRCS analyzer.cc - helper.cc analysis_pass - DEPS ${analysis_deps} + DEPS ${analysis_deps} analysis_helper ) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index c71cff889e..822c7799bb 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -4,4 +4,6 @@ set(analysis_deps ${analysis_deps} subgraph_detector tensorrt_subgraph_pass CACHE INTERNAL "") +set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) +file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 2dc426033b..e9969b84f3 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -27,7 +27,7 @@ endif() cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope) cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager) cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce) cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc 
b/paddle/fluid/inference/api/analysis_predictor.cc index 814542cd0b..3a707907d9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -551,6 +551,4 @@ USE_TRT_CONVERTER(pad); USE_TRT_CONVERTER(split); USE_TRT_CONVERTER(prelu); USE_TRT_CONVERTER(conv2d_transpose); - -USE_PASS(tensorrt_subgraph_pass); #endif diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 16a9b50e6f..fbe7fe7b7e 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,5 +1,9 @@ set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) +if(WITH_GPU AND TENSORRT_FOUND) + set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor) +endif() + function(download_model install_dir model_name) if (NOT EXISTS ${install_dir}) inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name}) @@ -75,11 +79,11 @@ endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) # resnet50 -inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 +inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") # mobilenet with depthwise_conv op -inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet +inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") # anakin @@ -89,15 +93,15 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1") inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin") inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt") - cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc - ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin + cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc + ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt DEPS inference_anakin_api_shared SERIAL) # anakin mobilenet if(WITH_GPU) set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet") inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin") - cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc + cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin DEPS inference_anakin_api_shared dynload_cuda SERIAL) endif() @@ -109,6 +113,6 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") endif() inference_analysis_test(test_trt_models SRCS trt_models_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL) endif() From b3364d40350c95e7fc804f79dfac42057590c108 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Nov 2018 10:03:50 +0800 Subject: [PATCH 53/80] fix(Macos): fix compile on macos test=develop --- 
paddle/fluid/memory/allocation/best_fit_allocator_test.cc | 1 + paddle/fluid/memory/allocation/best_fit_allocator_test.cu | 1 + 2 files changed, 2 insertions(+) diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc index 4122b3d709..20748a23a1 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/best_fit_allocator.h" +#include #include // NOLINT #include #include "gtest/gtest.h" diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index 50aecda97a..f7f17e1d36 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include // NOLINT #include #include "gtest/gtest.h" From 175b847f6dd900456dc0f0a39cb1eb3394431ea6 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 21 Nov 2018 12:24:25 +0800 Subject: [PATCH 54/80] Add API example for logical ops and clip ops test=develop --- python/paddle/fluid/layers/nn.py | 250 ++++++++++++++++++------------- 1 file changed, 149 insertions(+), 101 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 99acd7e308..7b0a3e2c82 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -726,11 +726,11 @@ def dynamic_gru(input, create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|bool|None): The parameter attribute for the bias - of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates + of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates the bias in the update gate, reset gate and candidate calculations. - If it is set to False, no bias will be applied to the update gate, - reset gate and candidate calculations. If it is set to None or one - attribute of ParamAttr, dynamic_gru will create ParamAttr as + If it is set to False, no bias will be applied to the update gate, + reset gate and candidate calculations. If it is set to None or one + attribute of ParamAttr, dynamic_gru will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. is_reverse(bool): Whether to compute reversed GRU, default @@ -847,11 +847,11 @@ def gru_unit(input, create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|bool|None): The parameter attribute for the bias - of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates + of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates the bias in the update gate, reset gate and candidate calculations. - If it is set to False, no bias will be applied to the update gate, - reset gate and candidate calculations. If it is set to None or one - attribute of ParamAttr, gru_unit will create ParamAttr as + If it is set to False, no bias will be applied to the update gate, + reset gate and candidate calculations. If it is set to None or one + attribute of ParamAttr, gru_unit will create ParamAttr as bias_attr. 
If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. activation (string): The activation type for cell (actNode). @@ -1064,9 +1064,9 @@ def dropout(x, inference: out = input (make is a tensor same shape with input, value is 0 or 1 ratio of 0 is dropout_prob) - dropout op can be removed from the program. + dropout op can be removed from the program. the program will be efficient - + Returns: @@ -2149,7 +2149,7 @@ def pool2d(input, ceil_mode (bool): ${ceil_mode_comment} name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. - exclusive (bool): Whether to exclude padding points in average pooling + exclusive (bool): Whether to exclude padding points in average pooling mode, default is true Returns: @@ -2240,7 +2240,7 @@ def pool3d(input, ceil_mode (bool): ${ceil_mode_comment} name (str): A name for this layer(optional). If set None, the layer will be named automatically. - exclusive (bool): Whether to exclude padding points in average pooling + exclusive (bool): Whether to exclude padding points in average pooling mode, default is true Returns: @@ -4342,7 +4342,7 @@ def nce(input, sampler (str): The sampler used to sample class from negtive classes. It can be 'uniform', 'log_uniform' or 'custom_dist'. default: 'uniform'. - custom_dist (Variable): A tensor with shape [num_total_classes]. + custom_dist (Variable): A tensor with shape [num_total_classes]. It is used when sampler is set to 'custom_dist'. custom_dist[i] is the probsbility of i-th class to be sampled. default: None. @@ -4385,7 +4385,7 @@ def nce(input, num_neg_samples=3, sampler="custom_dist", custom_dist=dist) - + """ helper = LayerHelper('nce', **locals()) assert isinstance(input, Variable) @@ -4556,9 +4556,9 @@ def transpose(x, perm, name=None): Examples: .. code-block:: python - # use append_batch_size=False to avoid prepending extra + # use append_batch_size=False to avoid prepending extra # batch size in shape - x = fluid.layers.data(name='x', shape=[5, 10, 15], + x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32', append_batch_size=False) x_transposed = layers.transpose(x, perm=[1, 0, 2]) """ @@ -4835,7 +4835,7 @@ def softmax_with_cross_entropy(logits, 3) If numeric_stable_mode is True, softmax is calculated first by: .. math:: - + max_j = \\max_{i=0}^{K}{\\text{logit}_i} log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j) @@ -4858,18 +4858,18 @@ def softmax_with_cross_entropy(logits, numeric_stable_mode (bool): A flag to indicate whether to use a more numerically stable algorithm. Only valid when soft_label is False and GPU is used. - When soft_label is True or CPU is used, - the algorithm is always numerically stable. - Note that the speed may be slower when use + When soft_label is True or CPU is used, + the algorithm is always numerically stable. + Note that the speed may be slower when use stable algorithm. Default: False - return_softmax (bool): A flag indicating whether to return the softmax + return_softmax (bool): A flag indicating whether to return the softmax along with the cross entropy loss. 
Default: False Returns: - Variable or Tuple of two Variables: Return the cross entropy loss if - `return_softmax` is False, otherwise the tuple - (loss, softmax), where the cross entropy loss is - a 2-D tensor with shape [N x 1], and softmax is a + Variable or Tuple of two Variables: Return the cross entropy loss if + `return_softmax` is False, otherwise the tuple + (loss, softmax), where the cross entropy loss is + a 2-D tensor with shape [N x 1], and softmax is a 2-D tensor with shape [N x K]. Examples: @@ -5756,20 +5756,20 @@ def image_resize(input, Default: None name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. - resample(str): The resample method. It supports 'BILINEAR' and 'NEAREST' + resample(str): The resample method. It supports 'BILINEAR' and 'NEAREST' currently. Default: 'BILINEAR' - actual_shape(Variable): An optional input to specify output shape - dynamically. If provided, image resize - according to this given shape rather than + actual_shape(Variable): An optional input to specify output shape + dynamically. If provided, image resize + according to this given shape rather than :attr:`out_shape` and :attr:`scale` specifying - shape. That is to say actual_shape has the - highest priority. It is recommended to use - actual_shape instead of :attr:`out_shape` if you - want to specify output shape dynamically. When - using actual_shape to specify output shape, one of - :attr:`out_shape` and :attr:`scale` should also be - set, otherwise errors would be occured in graph + shape. That is to say actual_shape has the + highest priority. It is recommended to use + actual_shape instead of :attr:`out_shape` if you + want to specify output shape dynamically. When + using actual_shape to specify output shape, one of + :attr:`out_shape` and :attr:`scale` should also be + set, otherwise errors would be occured in graph constructing stage. Default: None @@ -5780,7 +5780,7 @@ def image_resize(input, Raises: TypeError: out_shape should be a list or tuple or Variable. TypeError: actual_shape should either be Variable or None. - ValueError: The 'resample' of image_resize can only be 'BILINEAR' + ValueError: The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently. ValueError: One of out_shape and scale must not be None. ValueError: out_shape length should be 2. @@ -5852,17 +5852,17 @@ def resize_bilinear(input, name=None, actual_shape=None): """ - Resize input by performing bilinear interpolation based on given - output shape which specified by actual_shape, out_shape and scale + Resize input by performing bilinear interpolation based on given + output shape which specified by actual_shape, out_shape and scale in priority order. - Bilinear interpolation is an extension of linear interpolation for - interpolating functions of two variables (e.g. H-direction and - W-direction in this op) on a rectilinear 2D grid. The key idea is - to perform linear interpolation first in one direction, and then + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. H-direction and + W-direction in this op) on a rectilinear 2D grid. The key idea is + to perform linear interpolation first in one direction, and then again in the other direction. 
- For details of bilinear interpolation, please refer to Wikipedia: + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation Args: @@ -5875,17 +5875,17 @@ def resize_bilinear(input, a higher priority than scale. Default: None. name(str|None): The output variable name. - actual_shape(Variable): An optional input to specify output shape - dynamically. If provided, image resize - according to this given shape rather than + actual_shape(Variable): An optional input to specify output shape + dynamically. If provided, image resize + according to this given shape rather than :attr:`out_shape` and :attr:`scale` specifying - shape. That is to say actual_shape has the - highest priority. It is recommended to use - actual_shape instead of :attr:`out_shape` if you - want to specify output shape dynamically. When - using actual_shape to specify output shape, one of - :attr:`out_shape` and :attr:`scale` should also be - set, otherwise errors would be occured in graph + shape. That is to say actual_shape has the + highest priority. It is recommended to use + actual_shape instead of :attr:`out_shape` if you + want to specify output shape dynamically. When + using actual_shape to specify output shape, one of + :attr:`out_shape` and :attr:`scale` should also be + set, otherwise errors would be occured in graph constructing stage. Default: None @@ -5909,11 +5909,11 @@ def resize_nearest(input, actual_shape=None): """ Resize input by performing nearest neighbor interpolation in both the - 3rd dimention(in height direction) and the 4th dimention(in width - direction) based on given output shape which specified by actual_shape, + 3rd dimention(in height direction) and the 4th dimention(in width + direction) based on given output shape which specified by actual_shape, out_shape and scale in priority order. - For details of nearest neighbor interpolation, please refer to Wikipedia: + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation Args: @@ -5926,17 +5926,17 @@ def resize_nearest(input, a higher priority than scale. Default: None. name(str|None): The output variable name. - actual_shape(Variable): An optional input to specify output shape - dynamically. If provided, image resize - according to this given shape rather than + actual_shape(Variable): An optional input to specify output shape + dynamically. If provided, image resize + according to this given shape rather than :attr:`out_shape` and :attr:`scale` specifying - shape. That is to say actual_shape has the - highest priority. It is recommended to use - actual_shape instead of :attr:`out_shape` if you - want to specify output shape dynamically. When - using actual_shape to specify output shape, one of - :attr:`out_shape` and :attr:`scale` should also be - set, otherwise errors would be occured in graph + shape. That is to say actual_shape has the + highest priority. It is recommended to use + actual_shape instead of :attr:`out_shape` if you + want to specify output shape dynamically. When + using actual_shape to specify output shape, one of + :attr:`out_shape` and :attr:`scale` should also be + set, otherwise errors would be occured in graph constructing stage. Default: None @@ -6446,15 +6446,15 @@ def affine_grid(theta, out_shape, name=None): [x_14, x_15, x_16]] [[x_21, x_22, x_23] [x_24, x_25, x_26]]] - + out_shape = [2, 3, 5, 5] - + Step 1: - + Generate normalized coordinates according to out_shape. 
The values of the normalized coordinates are in the interval between -1 and 1. The shape of the normalized coordinates is [2, H, W] as below: - + C = [[[-1. -1. -1. -1. -1. ] [-0.5 -0.5 -0.5 -0.5 -0.5] [ 0. 0. 0. 0. 0. ] @@ -7702,6 +7702,15 @@ def logical_and(x, y, out=None, name=None): Returns: out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + left = fluid.layers.data( + name='left', shape=[1], dtype='int32') + right = fluid.layers.data( + name='right', shape=[1], dtype='int32') + result = fluid.layers.logical_and(x=left, y=right) """ return _logical_op( @@ -7721,6 +7730,15 @@ def logical_or(x, y, out=None, name=None): Returns: out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + left = fluid.layers.data( + name='left', shape=[1], dtype='int32') + right = fluid.layers.data( + name='right', shape=[1], dtype='int32') + result = fluid.layers.logical_or(x=left, y=right) """ return _logical_op( @@ -7740,6 +7758,15 @@ def logical_xor(x, y, out=None, name=None): Returns: out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + left = fluid.layers.data( + name='left', shape=[1], dtype='int32') + right = fluid.layers.data( + name='right', shape=[1], dtype='int32') + result = fluid.layers.logical_xor(x=left, y=right) """ return _logical_op( @@ -7758,6 +7785,13 @@ def logical_not(x, out=None, name=None): Returns: out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + left = fluid.layers.data( + name='left', shape=[1], dtype='int32') + result = fluid.layers.logical_not(x=left) """ return _logical_op( @@ -7777,6 +7811,13 @@ def clip(x, min, max, name=None): Returns: out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + input = fluid.layers.data( + name='data', shape=[1], dtype='float32') + reward = fluid.layers.clip(x=input, min=-1.0, max=1.0) """ helper = LayerHelper("clip", **locals()) @@ -7809,6 +7850,13 @@ def clip_by_norm(x, max_norm, name=None): Returns: out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + input = fluid.layers.data( + name='data', shape=[1], dtype='float32') + reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0) """ helper = LayerHelper("clip_by_norm", **locals()) @@ -7954,19 +8002,19 @@ def maxout(x, groups, name=None): def space_to_depth(x, blocksize, name=None): """ Gives a blocksize to space_to_depth the input LoDtensor with Layout: [batch, channel, height, width] - - This op rearranges blocks of spatial data, into depth. More specifically, this op outputs a copy of the - input LoDtensor where values from the height and width dimensions are moved to the channel dimension. + + This op rearranges blocks of spatial data, into depth. More specifically, this op outputs a copy of the + input LoDtensor where values from the height and width dimensions are moved to the channel dimension. The attr blocksize indicates the input block size. 
- - space_to_depth will reorgnize the elements of input with shape[batch, channel, height, width] according + + space_to_depth will reorgnize the elements of input with shape[batch, channel, height, width] according to blocksize to construct output with shape [batch, channel * blocksize * blocksize, height/blocksize, width/blocksize]: - - space_to_depth is used to This operation is useful for resizing the activations between convolutions + + space_to_depth is used to This operation is useful for resizing the activations between convolutions (but keeping all data) - Non-overlapping blocks of size block_size x block size are rearranged into depth at each location. - - The depth of the output tensor is block_size * block_size * input channel + - The depth of the output tensor is block_size * block_size * input channel - The Y, X coordinates within each block of the input become the high order component of the output channel index - channel should be divisible by square of blocksize - height, width should be divsible by blocksize @@ -8013,7 +8061,7 @@ def space_to_depth(x, blocksize, name=None): @templatedoc() def sequence_reverse(x, name=None): - """ + """ ${comment} Args: @@ -8080,21 +8128,21 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None): def similarity_focus(input, axis, indexes, name=None): - """ + """ SimilarityFocus Operator Generate a similarity focus mask with the same shape of input using the following method: - 1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding - to the axis according to the indexes. For example, if axis=1 and indexes=[a], - it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X + 1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding + to the axis according to the indexes. For example, if axis=1 and indexes=[a], + it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C). - 2. For each index, find the largest numbers in the tensor T, so that the same - row and same column has at most one number(what it means is that if the - largest number has been found in the i-th row and the j-th column, then - the numbers in the i-th row or j-th column will be skipped. And then the - next largest number will be selected from the remaining numbers. Obviously - there will be min(B, C) numbers), and mark the corresponding position of the - 3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for + 2. For each index, find the largest numbers in the tensor T, so that the same + row and same column has at most one number(what it means is that if the + largest number has been found in the i-th row and the j-th column, then + the numbers in the i-th row or j-th column will be skipped. And then the + next largest number will be selected from the remaining numbers. Obviously + there will be min(B, C) numbers), and mark the corresponding position of the + 3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for each index. 3. Broadcast the 3-D similarity focus mask to the same shape of input X. @@ -8150,16 +8198,16 @@ def similarity_focus(input, axis, indexes, name=None): [1.0, 0.0]]]] Args: - input(Variable): The input tensor variable(default float). It should + input(Variable): The input tensor variable(default float). It should be a 4-D tensor with shape [BatchSize, A, B, C]. axis(int): Indicating the dimension to be selected. It can only be 1, 2 or 3. 
indexes(list): Indicating the indexes of the selected dimension. Returns: - Variable: A tensor variable with the same shape and same type + Variable: A tensor variable with the same shape and same type as the input. - + Examples: .. code-block:: python data = fluid.layers.data( @@ -8262,12 +8310,12 @@ def hash(input, hash_size, num_hash=1, name=None): @templatedoc() def grid_sampler(x, grid, name=None): """ - This operation samples input X by using bilinear interpolation based on + This operation samples input X by using bilinear interpolation based on flow field grid, which is usually gennerated by affine_grid. The grid of - shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates - with shape [N, H, W] each, where grid_x is indexing the 4th dimension - (in width dimension) of input data x and grid_y is indexng the 3rd - dimention (in height dimension), finally results is the bilinear + shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates + with shape [N, H, W] each, where grid_x is indexing the 4th dimension + (in width dimension) of input data x and grid_y is indexng the 3rd + dimention (in height dimension), finally results is the bilinear interpolation value of 4 nearest corner points. Step 1: @@ -8277,7 +8325,7 @@ def grid_sampler(x, grid, name=None): grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) Step 2: - Indices input data X with grid (x, y) in each [H, W] area, and bilinear + Indices input data X with grid (x, y) in each [H, W] area, and bilinear interpolate point value by 4 nearest points. wn ------- y_n ------- en @@ -8314,7 +8362,7 @@ def grid_sampler(x, grid, name=None): name (str, default None): The name of this layer. Returns: - out(Variable): Output of shape [N, C, H, W] data samples input X + out(Variable): Output of shape [N, C, H, W] data samples input X using bilnear interpolation based on input grid. 
    Examples:

From cda60311f94aea91f8abd0394446d12095d1a8a7 Mon Sep 17 00:00:00 2001
From: Dang Qingqing
Date: Tue, 20 Nov 2018 13:45:33 +0800
Subject: [PATCH 55/80] Fix compiling with cuDNN v5

test=develop
---
 paddle/fluid/operators/CMakeLists.txt              | 9 ++++++---
 paddle/fluid/operators/conv_fusion_op.cu.cc        | 4 ++++
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++++
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 975c3bfc33..ca5b30e7b8 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -36,15 +36,18 @@ endif()

 register_operators(EXCLUDES warpctc_op conv_fusion_op)

-# warpctc_cudnn need cudnn 7 above
+# warpctc_op needs cudnn 7 above
 if (WITH_GPU)
   if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
   else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
   endif()
-  op_library(conv_fusion_op)
-  file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
+  # conv_fusion_op needs cudnn 7 above
+  if (NOT ${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+    op_library(conv_fusion_op)
+    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
+  endif()
 else()
   op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()

diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
index bd1041ce08..2c09ee7394 100644
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -22,6 +22,7 @@ DECLARE_bool(cudnn_exhaustive_search);
 namespace paddle {
 namespace operators {

+#if CUDNN_VERSION >= 7001
 using Tensor = framework::Tensor;
 using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
@@ -178,10 +179,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
   }
 };
+#endif

 }  // namespace operators
 }  // namespace paddle

+#if CUDNN_VERSION >= 7001
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel<float>,
                         ops::CUDNNConvFusionOpKernel<double>);
+#endif

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 1513eca514..7101506f99 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -23,6 +23,10 @@ if(NOT WITH_DISTRIBUTE)
     LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
 endif(NOT WITH_DISTRIBUTE)

+if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
+endif()
+
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185

From 53760bb111c703f319ea3492c6ede13384095584 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Wed, 21 Nov 2018 13:29:51 +0800
Subject: [PATCH 56/80] Change requirements to support python 3.7

test=develop
---
 python/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/requirements.txt b/python/requirements.txt
index
84cf440397..2f81d85df0 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,5 +1,5 @@
 requests==2.9.2
-numpy>=1.12,<=1.14  #TODO:change to ">=1.12" when numpy fixes the bug in 1.15 and higher versions
+numpy>=1.12
 protobuf==3.1
 recordio>=0.1.0
 matplotlib==2.2.3  # TODO: let python3 paddlepaddle package use latest matplotlib

From 3edd32d07083473f7900329bf68a6263ff9b06d3 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Wed, 21 Nov 2018 13:53:54 +0800
Subject: [PATCH 57/80] fix(Compile): fix dependency error when compiling ops that use cub

Some operators depend on cub and xxhash only through headers. Such a
dependency should be declared explicitly on the operator targets rather
than only on pybind.

test=develop
---
 paddle/fluid/operators/CMakeLists.txt | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 975c3bfc33..9a98ba6d9d 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -34,7 +34,12 @@ if (WITH_GPU AND TENSORRT_FOUND)
   add_subdirectory(tensorrt)
 endif()

-register_operators(EXCLUDES warpctc_op conv_fusion_op)
+SET(OP_HEADER_DEPS xxhash)
+if (WITH_GPU)
+  SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
+endif()
+
+register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS})

 # warpctc_cudnn need cudnn 7 above
 if (WITH_GPU)
@@ -49,14 +54,14 @@ else()
   op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()

-set(COMMON_OP_DEPS "")
+set(COMMON_OP_DEPS ${OP_HEADER_DEPS})

-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
 if (NOT WIN32)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 endif()
 if (WITH_GPU)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)
+  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv)
 endif()

 # FIXME(typhoonzero): operator deps may not be needed.
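To illustrate the header-only dependency the patch above declares: a .cu file that uses cub pulls everything in via #include and links against nothing, roughly as in the following sketch (the file and function names are hypothetical and not taken from this series; only the include path, here OP_HEADER_DEPS, matters at build time):

// header_dep_sketch.cu -- hypothetical illustration, not part of this patch.
// cub is consumed purely through its headers, which is why targets that
// compile code like this need cub in OP_HEADER_DEPS even though there is
// no cub library to link.
#include <cuda_runtime.h>
#include <cub/cub.cuh>

float SumOnDevice(const float* d_in, int num_items) {
  float* d_out = nullptr;
  cudaMalloc(&d_out, sizeof(float));
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // First call only computes the required temporary storage size.
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
                         num_items);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Second call performs the actual reduction.
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
                         num_items);
  float result = 0.0f;
  cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_temp_storage);
  cudaFree(d_out);
  return result;
}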
From c19ff1f3d28b38867de8b98d63f19b8c759c4535 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Wed, 21 Nov 2018 15:37:36 +0800
Subject: [PATCH 58/80] Add python3.6 and python3.7 support in paddle build scripts

test=develop
---
 paddle/scripts/paddle_build.sh | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 32f9bca645..569e56e5a9 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -116,6 +116,18 @@ function cmake_gen() {
             export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
                 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
                 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
+        elif [ "$1" == "cp36-cp36m" ]; then
+            export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
+            export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
+            export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
+                -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
+                -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
+        elif [ "$1" == "cp37-cp37m" ]; then
+            export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
+            export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
+            export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3
+                -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
+                -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
         fi
       fi
     fi
@@ -419,7 +431,7 @@ function assert_api_not_changed() {
     source .env/bin/activate
     pip install ${PADDLE_ROOT}/build/python/dist/*whl
     python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
-    if [ "$1" == "cp35-cp35m" ]; then
+    if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
         # Use sed to make the python2 and python3 spec keep the same
         sed -i 's/arg0: str/arg0: unicode/g' new.spec
         sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec

From 255cc1eb6540785c8cb786a6c9f291fa53010ca0 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Wed, 21 Nov 2018 15:43:17 +0800
Subject: [PATCH 59/80] Add support for Mac build

test=develop
---
 paddle/scripts/paddle_build.sh | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 569e56e5a9..9632eaec00 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -94,6 +94,30 @@ function cmake_gen() {
         else
             exit 1
         fi
+    elif [ "$1" == "cp36-cp36m" ]; then
+        if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then
+            export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/
+            export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/
+            export PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH}
+            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
+                -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
+                -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
+            WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+        else
+            exit 1
+        fi
+    elif [ "$1" == "cp37-cp37m" ]; then
+        if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then
+            export
LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi fi else if [ "$1" != "" ]; then From 9bb1f66ddba67bfc7a3cb601917207c389305f31 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 21 Nov 2018 18:41:51 +0800 Subject: [PATCH 60/80] Polish code test=develop --- tools/manylinux1/Dockerfile.x64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index 4468220a4d..e91216a5b8 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -36,7 +36,7 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf tar xzf protobuf-cpp-3.1.0.tar.gz && \ cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz -RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt +RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \ From 3e3599f3d937e0444606056f3c9f2261b74dfd93 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 21 Nov 2018 11:31:04 +0000 Subject: [PATCH 61/80] Refine split tensorrt plugin --- .../inference/tensorrt/convert/split_op.cc | 3 +- .../tensorrt/plugin/split_op_plugin.cu | 157 ++++++++++++++---- .../tensorrt/plugin/split_op_plugin.h | 9 +- 3 files changed, 134 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 6620c76318..871354267e 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -40,7 +40,7 @@ class SplitOpConverter : public OpConverter { int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = boost::get>(op_desc.GetAttr("sections")); - PADDLE_ENFORCE(axis != 0); + // PADDLE_ENFORCE(axis != 0); if (axis < 0) { axis += input_dims.nbDims; } else { @@ -48,7 +48,6 @@ class SplitOpConverter : public OpConverter { } PADDLE_ENFORCE(output_lengths.size() == output_num); - // plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 4adea2db1e..1ec0753e9f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include +#include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { @@ -19,6 +21,52 @@ namespace inference { namespace tensorrt { namespace plugin { +// copied from operators::math::SplitFunctor +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int* out_cols, + int out_cols_size, T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = out_cols[0]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = out_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs_data[curr_segment]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int fixed_out_col, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } + } +} + nvinfer1::Dims SplitPlugin::getOutputDimensions( int index, const nvinfer1::Dims* input_dims, int num_inputs) { PADDLE_ENFORCE_EQ(num_inputs, 1); @@ -31,48 +79,95 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions( int SplitPlugin::initialize() { PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS); - + // notice input dims is [C, H, W] + nvinfer1::Dims dims = this->getInputDims(0); + outer_rows_ = 1; + inner_cols_ = 1; + for (int i = 0; i < axis_; ++i) { + outer_rows_ *= dims.d[i]; + } + for (int i = axis_ + 1; i < dims.nbDims; ++i) { + inner_cols_ *= dims.d[i]; + } + same_shape_ = true; std::vector segment_offsets(1, 0); for (int i = 0; i < this->getNbOutputs(); ++i) { - segment_offsets.push_back(segment_offsets.back() + output_length_[i]); + if (output_length_[i] != output_length_[0]) { + same_shape_ = false; + } + segment_offsets.push_back(segment_offsets.back() + + output_length_[i] * inner_cols_); } - segment_offsets_ = segment_offsets; - nvinfer1::Dims dims = this->getInputDims(0); - nx_ = 1; - for (int i = dims.nbDims - 1; i > axis_; --i) { - nx_ *= dims.d[i]; + inner_cols_ *= dims.d[axis_]; + d_segment_offsets_ = segment_offsets; + segment_offsets_ = std::move(segment_offsets); + d_output_ptrs_.resize(this->getNbOutputs(), nullptr); + return 0; +} + +template +inline void Split(cudaStream_t stream, const bool same_shape, + const int outer_rows, const int inner_cols, + const std::vector& segment_offsets, + const int* d_segment_offsets, const T* input, T** outputs) { + const int kThreadsPerBlock = 1024; + const int kMaxBlocks = 65535; + int block_cols = kThreadsPerBlock; + if (inner_cols < kThreadsPerBlock) { // block_cols is aligned by 32. 
+ block_cols = ((inner_cols + 31) >> 5) << 5; } - ny_ = dims.d[axis_]; - nz_ = 1; - for (int i = axis_ - 1; i >= 0; --i) { - nz_ *= dims.d[i]; + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int grid_cols = + std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks); + int grid_rows = + std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (same_shape) { + SplitKernel<<>>( + input, outer_rows, inner_cols, segment_offsets[1], outputs); + } else { + SplitKernel<<>>( + input, outer_rows, inner_cols, d_segment_offsets, + static_cast(segment_offsets.size()), outputs); } - return 0; } int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { - auto const& input_dims = this->getInputDims(0); - int input_size = 0; - float const* idata = reinterpret_cast(inputs[0]); - float** odatas = reinterpret_cast(outputs); - - // kernel impl here. - int inputBatchOffset = nx_ * ny_ * nz_; - for (size_t i = 0; i < this->getNbOutputs(); i++) { - for (size_t j = 0; j < batchSize; j++) { - cudaMemcpyAsync( - odatas[i] + - j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * - sizeof(float), - inputs[0] + - (inputBatchOffset * j + segment_offsets_[i] * nx_) * - sizeof(float), - (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float), - cudaMemcpyDeviceToDevice, stream); + float const* input_ptr = reinterpret_cast(inputs[0]); + if (axis_ == -1 && this->getNbOutputs() < 10) { + float** output_ptrs = reinterpret_cast(outputs); + int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) + ? sizeof(__half) + : sizeof(float); + for (int i = 0; i < this->getNbOutputs(); ++i) { + PADDLE_ENFORCE( + cudaMemcpyAsync( + output_ptrs[i], input_ptr + segment_offsets_[i], + (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size, + cudaMemcpyDeviceToDevice, stream) == cudaSuccess); + } + } else { + outer_rows_ *= batchSize; + const int* d_segment_offsets_ptr = + thrust::raw_pointer_cast(&d_segment_offsets_[0]); + float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); + PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs, + this->getNbOutputs() * sizeof(float*), + cudaMemcpyHostToDevice, + stream) == cudaSuccess); + if (this->getDataType() == nvinfer1::DataType::kFLOAT) { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, input_ptr, output_ptrs); + } else { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, (__half*)input_ptr, // NOLINT + (__half**)output_ptrs); // NOLINT } } - return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index b5b6e69992..6f028d3d72 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -25,7 +26,7 @@ namespace plugin { class SplitPlugin : public PluginTensorRT { public: SplitPlugin(int axis, std::vector const &output_lengths) - : axis_(axis), output_length_(output_lengths) {} + : axis_(axis), same_shape_(true), output_length_(output_lengths) {} SplitPlugin(void const *serial_data, size_t serial_length) { deserializeBase(serial_data, 
serial_length); @@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT { } int axis_; + int outer_rows_; + int inner_cols_; + bool same_shape_; std::vector output_length_; - int nx_, ny_, nz_; std::vector segment_offsets_; + thrust::device_vector d_segment_offsets_; + thrust::device_vector d_output_ptrs_; }; } // namespace plugin From 6c0e09cb1d64a014873db47cae1eeca0264e561c Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 20 Nov 2018 16:52:24 +0800 Subject: [PATCH 62/80] change interpolate unittest to serial. test=develop --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 29e4ca04a7..46dd2ef110 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -75,10 +75,12 @@ list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_dist_transformer) list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) +list(REMOVE_ITEM TEST_OPS test_interpolate_op) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) +py_test_modules(test_interpolate_op MODULES test_interpolate_op SERIAL) if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) From fd290c2580cfcaa5c80e41deb1d8fc6a4028099c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 21 Nov 2018 22:11:19 +0800 Subject: [PATCH 63/80] fix mac compile of analysis test=develop --- paddle/fluid/inference/analysis/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 0c73778b20..4bd3f93ef7 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -35,4 +35,4 @@ function(inference_analysis_test TARGET) endif() endfunction(inference_analysis_test) -inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS reset_tensor_array paddle_inference_api) From 6eba5bd276a8d79d5611ec42db0c47273fb4950c Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 21 Nov 2018 15:32:25 +0000 Subject: [PATCH 64/80] Fix direct copy and refine split ut test=develop --- .../tensorrt/convert/test_split_op.cc | 55 ++++++++++++++----- .../tensorrt/plugin/split_op_plugin.cu | 7 ++- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index f81d011552..23909378dd 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -20,30 +20,59 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(split_op, test) { +template +void TensorRTSplitTest(const std::vector &in_shape, + const std::vector §ions) { std::unordered_set parameters({""}); framework::Scope scope; - TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2)); - 
validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2)); - validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2)); + TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000); + + auto make_dim = [](const std::vector &shape) { + nvinfer1::DimsCHW dim; + dim.c() = shape[0]; + dim.h() = shape[1]; + dim.w() = shape[2]; + return dim; + }; + validator.DeclInputVar("split_input", make_dim(in_shape)); + std::vector output_vars; + for (size_t i = 0; i < sections.size(); ++i) { + auto out_shape = in_shape; + out_shape[Axis - 1] = sections[i]; + std::string output_name = "split_out" + std::to_string(i); + validator.DeclOutputVar(output_name, make_dim(out_shape)); + output_vars.push_back(output_name); + } // Prepare Op description framework::OpDesc desc; desc.SetType("split"); desc.SetInput("X", {"split_input"}); - desc.SetOutput("Out", {"split_out1", "split_out2"}); + desc.SetOutput("Out", output_vars); - int num = 0; - int axis = 1; - std::vector output_lengths = {2, 1}; - desc.SetAttr("axis", axis); - desc.SetAttr("num", num); - desc.SetAttr("sections", output_lengths); + desc.SetAttr("axis", Axis); + desc.SetAttr("num", 0); + desc.SetAttr("sections", sections); validator.SetOp(*desc.Proto()); - validator.Execute(1); + validator.Execute(BatchSize); +} + +TEST(split_op, test_same_shape_batch1) { + TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2}); +} + +TEST(split_op, test_different_shape_batch1) { + TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1}); +} + +TEST(split_op, test_same_shape_batch10) { + TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2}); +} + +TEST(split_op, test_different_shape_batch10) { + TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1}); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 1ec0753e9f..de61ace59e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -138,11 +138,12 @@ inline void Split(cudaStream_t stream, const bool same_shape, int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { float const* input_ptr = reinterpret_cast(inputs[0]); - if (axis_ == -1 && this->getNbOutputs() < 10) { + if (((batchSize == 1 && axis_ == 0) || axis_ == -1) && + this->getNbOutputs() < 10) { float** output_ptrs = reinterpret_cast(outputs); int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) - ? sizeof(__half) - : sizeof(float); + ? sizeof(float) + : sizeof(__half); for (int i = 0; i < this->getNbOutputs(); ++i) { PADDLE_ENFORCE( cudaMemcpyAsync( From bba6224042603fe4d52821c4c1918cb8ce00ec32 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 22 Nov 2018 01:26:50 +0800 Subject: [PATCH 65/80] Add doc comments test=develop --- tools/manylinux1/build_scripts/build_utils.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh index c1647ce244..d97745ad2d 100755 --- a/tools/manylinux1/build_scripts/build_utils.sh +++ b/tools/manylinux1/build_scripts/build_utils.sh @@ -53,6 +53,8 @@ function do_cpython_build { # NOTE --enable-shared for generating libpython shared library needed for # linking of some of the nupic.core test executables. 
if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then + # NOTE python 3.7 should be installed via make altinstall rather than + # make install, and we should specify the location of ssl CFLAGS="-Wformat" ./configure --prefix=${prefix} --with-openssl=/usr/local/ssl --enable-shared $unicode_flags > /dev/null make -j8 > /dev/null make altinstall > /dev/null From 3912545ffec3ea5a850420f0a804afadc9f0352a Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 22 Nov 2018 04:30:19 +0000 Subject: [PATCH 66/80] add dlpack support test=develop --- CMakeLists.txt | 1 + cmake/external/dlpack.cmake | 31 +++++ paddle/fluid/framework/CMakeLists.txt | 3 + paddle/fluid/framework/dlpack_tensor.cc | 127 +++++++++++++++++++ paddle/fluid/framework/dlpack_tensor.h | 45 +++++++ paddle/fluid/framework/dlpack_tensor_test.cc | 113 +++++++++++++++++ 6 files changed, 320 insertions(+) create mode 100644 cmake/external/dlpack.cmake create mode 100644 paddle/fluid/framework/dlpack_tensor.cc create mode 100644 paddle/fluid/framework/dlpack_tensor.h create mode 100644 paddle/fluid/framework/dlpack_tensor_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index c62cc9bfd7..b6ae241272 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,6 +190,7 @@ include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) include(external/xxhash) # download xxhash +include(external/dlpack) if (NOT WIN32) # there is no official support of snappystream, warpctc, nccl, cupti in windows diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake new file mode 100644 index 0000000000..94d8fcc668 --- /dev/null +++ b/cmake/external/dlpack.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) + +set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack) +set(DLPACK_INCLUDE_DIR ${DLPACK_SOURCE_DIR}/src/extern_dlpack/include) + +include_directories(${DLPACK_INCLUDE_DIR}) + +ExternalProject_Add( + extern_dlpack + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/dmlc/dlpack.git" + GIT_TAG "v0.2" + PREFIX ${DLPACK_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/dlpack_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(dlpack STATIC ${dummyfile}) +else() + add_library(dlpack INTERFACE) +endif() + +add_dependencies(dlpack extern_dlpack) + +LIST(APPEND externl_project_dependencies dlpack) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cb9057672c..d7d7834b49 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -205,3 +205,6 @@ cc_test(tuple_test SRCS tuple_test.cc ) if (NOT WIN32) cc_test(rw_lock_test SRCS rw_lock_test.cc) endif (NOT WIN32) + +cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) +cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc new file mode 100644 index 0000000000..04e3f78afe --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" + +namespace paddle { +namespace framework { + +namespace internal { +template +static ::DLDataType GetDLDataTypeCode() { + ::DLDataType dtype; + if (std::is_same::value || + std::is_floating_point::value) { + dtype.code = kDLFloat; + } else if (std::is_unsigned::value) { + dtype.code = kDLUInt; + } else if (std::is_integral::value) { + dtype.code = kDLInt; + } else { + PADDLE_THROW("Unsupported data type %s", typeid(T).name()); + } + dtype.bits = 8 * sizeof(T); + dtype.lanes = 1; + return dtype; +} + +static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { +#define REG_DL_DATA_TYPE(type) \ + { std::type_index(typeid(type)), GetDLDataTypeCode() } + static const std::unordered_map + type_to_dtype_map({ + REG_DL_DATA_TYPE(platform::float16), // NOLINT + REG_DL_DATA_TYPE(float), // NOLINT + REG_DL_DATA_TYPE(double), // NOLINT + REG_DL_DATA_TYPE(int), // NOLINT + REG_DL_DATA_TYPE(int64_t), // NOLINT + REG_DL_DATA_TYPE(bool), // NOLINT + REG_DL_DATA_TYPE(size_t), // NOLINT + REG_DL_DATA_TYPE(int16_t), // NOLINT + REG_DL_DATA_TYPE(uint8_t), // NOLINT + REG_DL_DATA_TYPE(int8_t) // NOLINT + }); + static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); + auto it = type_to_dtype_map.find(type); + PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s", + type.name()); + return it->second; +#undef REG_DL_DATA_TYPE +} + +struct DLContextVisitor : public boost::static_visitor<::DLContext> { + inline ::DLContext operator()(const platform::CPUPlace &place) const { + DLContext ctx; + ctx.device_type = kDLCPU; + ctx.device_id = 0; + return ctx; + } + + inline ::DLContext operator()(const platform::CUDAPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLGPU; + ctx.device_id = place.device; + return ctx; +#else + PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version"); +#endif + } + + inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLCPUPinned; + ctx.device_id = 0; + return ctx; +#else + PADDLE_THROW( + "platform::CUDAPinnedPlace is not supported in CPU only version"); +#endif + } +}; +} // namespace internal + +DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { + // init data, data buffer + t_.data = const_cast(tensor.data()); + + // init ctx, DLContext type with device_type and device_id + auto place = tensor.place(); + t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place); + + // init dtype + t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); + t_.dtype.lanes = lanes; + + // init ndim, tensor rank + auto &dims = tensor.dims(); + using DimType = decltype(t_.ndim); // int + t_.ndim = static_cast(dims.size()); + + // init shape, tensor dims + t_.shape = shape_; + for (DimType i = 0; i < t_.ndim; ++i) { + t_.shape[i] = dims[i]; + } + + // init strides, nullptr means the tensor is compact + t_.strides = nullptr; + + // init byte_offset + t_.byte_offset = 0; +} + +} // namespace framework +} // namespace paddle 
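For reference, a minimal usage sketch of the new DLPackTensor, modeled on the dlpack_tensor_test.cc added later in this patch (the wrapper function and file names here are hypothetical, not code from the patch):

// dlpack_usage_sketch.cc -- hypothetical illustration, not part of the patch.
// Wraps a fluid Tensor as a DLPack ::DLTensor view without copying data.
#include "paddle/fluid/framework/dlpack_tensor.h"

namespace paddle {
namespace framework {

void ShareTensorViaDLPack() {
  DDim dims{4, 5, 6, 7};
  Tensor tensor;
  tensor.Resize(dims);
  tensor.mutable_data<float>(platform::CPUPlace());

  // DLPackTensor only borrows the buffer; the Tensor must outlive the view.
  DLPackTensor dlpack_tensor(tensor);
  ::DLTensor& dl_tensor = dlpack_tensor;  // implicit conversion defined above
  // dl_tensor.data, dl_tensor.shape and dl_tensor.dtype now describe the
  // same memory and can be handed to any DLPack consumer.
}

}  // namespace framework
}  // namespace paddle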
diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h new file mode 100644 index 0000000000..0c52bce1ef --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { + +class DLPackTensor { + public: + using LaneType = decltype(::DLTensor::dtype.lanes); // uint16_t + using ShapeType = + std::remove_reference::type; // int64_t + + // lanes is only used in CPU to enable vectorization + explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1); + + inline operator const ::DLTensor&() const { return t_; } + + inline operator ::DLTensor&() { return t_; } + + private: + ::DLTensor t_; + + // The shape in DLTensor is defined as int64_t* + // Add this member to make TVMTensor init without heap allocation + ShapeType shape_[9]; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc new file mode 100644 index 0000000000..938b056350 --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +namespace { // NOLINT +template +constexpr uint8_t GetDLDataTypeCode() { + return std::is_same::value || + std::is_floating_point::value + ? static_cast(kDLFloat) + : (std::is_unsigned::value + ? static_cast(kDLUInt) + : (std::is_integral::value ? 
static_cast(kDLInt) + : static_cast(-1))); +} +} // NOLINT + +template +void TestMain(const platform::Place &place, uint16_t lanes) { + DDim dims{4, 5, 6, 7}; + Tensor tensor; + tensor.Resize(dims); + void *p = tensor.mutable_data(place); + + DLPackTensor dlpack_tensor(tensor, lanes); + ::DLTensor &dl_tensor = dlpack_tensor; + + CHECK_EQ(p, dl_tensor.data); + if (platform::is_cpu_place(place)) { + CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else if (platform::is_gpu_place(place)) { + CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type); + CHECK_EQ(boost::get(place).device, + dl_tensor.ctx.device_id); + } else if (platform::is_cuda_pinned_place(place)) { + CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else { + CHECK_EQ(false, true); + } + + CHECK_EQ(dims.size(), dl_tensor.ndim); + for (auto i = 0; i < dims.size(); ++i) { + CHECK_EQ(dims[i], dl_tensor.shape[i]); + } + + CHECK_EQ(dl_tensor.strides == nullptr, true); + CHECK_EQ(static_cast(0), dl_tensor.byte_offset); + + CHECK_EQ(lanes, dl_tensor.dtype.lanes); + CHECK_EQ(sizeof(T) * 8, dl_tensor.dtype.bits); + + CHECK_EQ(GetDLDataTypeCode(), dl_tensor.dtype.code); +} + +template +void TestMainLoop() { +#ifdef PADDLE_WITH_CUDA + std::vector places{platform::CPUPlace(), + platform::CUDAPlace(0), + platform::CUDAPinnedPlace()}; + if (platform::GetCUDADeviceCount() > 1) { + places.emplace_back(platform::CUDAPlace(1)); + } +#else + std::vector places{platform::CPUPlace()}; +#endif + std::vector lanes{1, 2}; + for (auto &p : places) { + for (auto &l : lanes) { + TestMain(p, l); + } + } +} + +#define PADDLE_DLPACK_TEST(type) \ + TEST(dlpack, test_##type) { TestMainLoop(); } + +using float16 = platform::float16; +PADDLE_DLPACK_TEST(float16); +PADDLE_DLPACK_TEST(float); +PADDLE_DLPACK_TEST(double); +PADDLE_DLPACK_TEST(int); +PADDLE_DLPACK_TEST(int64_t); +PADDLE_DLPACK_TEST(bool); +PADDLE_DLPACK_TEST(size_t); +PADDLE_DLPACK_TEST(int16_t); +PADDLE_DLPACK_TEST(uint8_t); +PADDLE_DLPACK_TEST(int8_t); + +#undef PADDLE_DLPACK_TEST + +} // namespace framework +} // namespace paddle From 533c5d580369e9605e9f0080c26c337c25301c3b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 22 Nov 2018 13:00:29 +0800 Subject: [PATCH 67/80] fix(Cpu): fix cpu compile and unittest test=develop --- paddle/fluid/pybind/pybind.cc | 4 ++++ python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5ef5bf4d6c..358340b897 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -46,6 +46,7 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" @@ -95,6 +96,9 @@ bool IsCompiledWithDIST() { } PYBIND11_PLUGIN(core) { + // Not used, just make sure cpu_info.cc is linked. 
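// (The point of the call below is presumably that referencing one symbol
// defined in cpu_info.cc stops the linker from discarding that object file,
// and with it the flags it registers; which symbol is used does not matter.)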
+ paddle::platform::CpuTotalPhysicalMemory(); + paddle::memory::allocation::UseAllocatorStrategyGFlag(); py::module m("core", "C++ core of PaddlePaddle"); diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 79fa99d002..4fa69191ad 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -23,7 +23,9 @@ if(NOT WITH_DISTRIBUTE) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) endif(NOT WITH_DISTRIBUTE) -if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) +if (NOT ${WITH_GPU}) + LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) +elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) endif() From d9a1f3e58e89c3f3cc38fb8edc830ac38578c733 Mon Sep 17 00:00:00 2001 From: wopeizl Date: Thu, 22 Nov 2018 13:33:26 +0800 Subject: [PATCH 68/80] Windows/online (#14474) * add recordio support * disable the openblas multi-thread on windows since no support adjust the python script * code style * code style test=develop * add create_recordio_file_reader back * fix code style test=develop * fix the gtest.cmake on windows * fix cc_test on windows * fix the win build test=develop * remove fused compile support on windows test=develop * add the jit support test=develop * add the jit support, test=develop * add the jit support, test=develop * add the jit back fix compile error on windows * rollback test=develop * test case fix * disable DSO by default on windows * exclude warpctc_op on windows * exclude the dynload_warpctc out on windows test=develop * fix the scripts error test=develop * disable avx on windows by default test=develop * re-organize the cmake file * disable mkl on windows by default * add warp_ctc back * fix the dependency * fix the dependency * fix the build issue on windows * remove unsupported flag on windows * code style * code style test=develop * fix issue * add profiler, parallel_executor back * clean up the pre-definitions on windows * fix build issue * test=develop --- CMakeLists.txt | 21 +- cmake/external/gtest.cmake | 4 + cmake/external/snappy.cmake | 12 +- cmake/external/snappystream.cmake | 61 +-- cmake/generic.cmake | 3 + cmake/operators.cmake | 4 +- cmake/simd.cmake | 73 ++-- paddle/fluid/CMakeLists.txt | 6 +- paddle/fluid/framework/CMakeLists.txt | 15 +- .../fast_threaded_ssa_graph_executor.h | 2 +- paddle/fluid/framework/eigen.h | 5 - paddle/fluid/framework/op_registry.h | 5 - paddle/fluid/framework/operator.cc | 2 - paddle/fluid/framework/operator.h | 2 - paddle/fluid/inference/api/api_impl.h | 6 - .../fluid/memory/allocation/cpu_allocator.h | 6 + paddle/fluid/operators/CMakeLists.txt | 12 +- .../fluid/operators/hierarchical_sigmoid_op.h | 2 +- paddle/fluid/operators/math/CMakeLists.txt | 35 +- .../math/detail/activation_functions.h | 1 + paddle/fluid/operators/math/matrix_bit_code.h | 3 +- .../operators/reader/create_py_reader_op.cc | 2 +- paddle/fluid/operators/roi_align_op.cc | 6 +- paddle/fluid/operators/roi_pool_op.cc | 6 +- paddle/fluid/operators/space_to_depth_op.cc | 2 +- paddle/fluid/platform/CMakeLists.txt | 12 +- paddle/fluid/platform/cpu_helper.cc | 7 + paddle/fluid/platform/device_tracer.h | 12 +- paddle/fluid/platform/dynload/cudnn.h | 2 - paddle/fluid/platform/enforce.h | 70 +--- paddle/fluid/platform/init.cc | 7 - paddle/fluid/platform/init.h | 3 - paddle/fluid/platform/port.h | 35 +- paddle/fluid/platform/profiler.cc | 2 +- paddle/fluid/platform/profiler.h | 10 - 
.../fluid/platform/stream_callback_manager.h | 13 +- paddle/fluid/pybind/CMakeLists.txt | 8 +- paddle/fluid/pybind/pybind.cc | 21 +- python/paddle/fluid/__init__.py | 5 +- python/paddle/fluid/contrib/inferencer.py | 4 +- python/paddle/fluid/contrib/trainer.py | 3 +- python/paddle/fluid/layers/io.py | 118 +++--- python/paddle/fluid/layers/nn.py | 368 +++++++++--------- python/paddle/fluid/layers/ops.py | 41 +- 44 files changed, 483 insertions(+), 554 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c62cc9bfd7..bc2ac2cd93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,6 +130,21 @@ if (APPLE OR WIN32) "Disable MKL for building on mac and windows" FORCE) endif() +if (WIN32) + set(WITH_AVX OFF CACHE STRING + "Disable AVX when compiling for Windows" FORCE) + set(WITH_DSO OFF CACHE STRING + "Disable DSO when compiling for Windows" FORCE) + set(WITH_MKL OFF CACHE STRING + "Disable MKL when compiling for Windows" FORCE) + set(WITH_DISTRIBUTE OFF CACHE STRING + "Disable DISTRIBUTE when compiling for Windows" FORCE) + set(WITH_C_API OFF CACHE STRING + "Disable C_API when compiling for Windows" FORCE) + set(WITH_FLUID_ONLY ON CACHE STRING + "Enable FLUID_ONLY when compiling for Windows" FORCE) +endif() + set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") @@ -190,11 +205,11 @@ include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) include(external/xxhash) # download xxhash - -if (NOT WIN32) -# there is no official support of snappystream, warpctc, nccl, cupti in windows include(external/snappy) # download snappy include(external/snappystream) # download snappystream + +if (NOT WIN32) +# there is no official support of warpctc, nccl, cupti in windows include(external/warpctc) # download, build, install warpctc include(cupti) endif (NOT WIN32) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index d335298742..4fe9c13fb7 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -50,7 +50,11 @@ IF(WITH_TESTING) CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_GMOCK=ON diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index af09ed4d5d..b30403d2d8 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -24,7 +24,11 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." 
FORCE)
-set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+if (WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
+else(WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+endif (WIN32)
 
 ExternalProject_Add(
     extern_snappy
@@ -34,8 +38,12 @@ ExternalProject_Add(
     UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+               -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+               -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+               -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+               -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+               -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
                -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
                -DCMAKE_POSITION_INDEPENDENT_CODE=ON
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 6df636d7fa..1ec79462c1 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -18,36 +18,45 @@ ENDIF()
 
 include (ExternalProject)
 
-# NOTE: snappy is needed when linking with recordio
-
 set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
 set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
 set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
 
-set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
-
-ExternalProject_Add(
-    extern_snappystream
-    GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
-    GIT_TAG "0.2.8"
-    PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
-    UPDATE_COMMAND ""
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-               -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-               -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-               -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
-               ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS
-        -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-        -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-    DEPENDS snappy
-)
+if(WIN32)
+    # FIXME: VS2015 comes without VLA support
+    set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib")
+    MESSAGE(WARNING "snappystream does not compile on Windows;
+        please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR})
+else(WIN32)
+    set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+
+    ExternalProject_Add(
+        extern_snappystream
+        GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
+        GIT_TAG "0.2.8"
+        PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
+        UPDATE_COMMAND ""
+        CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                   -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                   -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                   -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                   -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                   -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                   -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                   -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                   -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
+                   -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
+                   -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                   -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                   -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
+                   ${EXTERNAL_OPTIONAL_ARGS}
+        CMAKE_CACHE_ARGS +
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + DEPENDS snappy + ) +endif(WIN32) add_library(snappystream STATIC IMPORTED GLOBAL) set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index e21f89c7c5..111627a932 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -351,6 +351,9 @@ function(cc_test TARGET_NAME) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + if(WIN32) + target_link_libraries(${TARGET_NAME} shlwapi) + endif(WIN32) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} diff --git a/cmake/operators.cmake b/cmake/operators.cmake index ba9c266d13..17107e0698 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,9 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" - "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" - "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 566dc75fda..86096d4fea 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,43 +57,46 @@ int main() return 0; }" SSE3_FOUND) -# Check AVX -set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) -set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; -}" AVX_FOUND) +# disable AVX by default on windows +if(NOT WIN32) + # Check AVX + set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) + set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; + }" AVX_FOUND) -# Check AVX 2 -set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) -set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; -}" AVX2_FOUND) + # Check AVX 2 + set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) + set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; + }" AVX2_FOUND) -# Check AVX512F -set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) -set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from 
TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; -}" AVX512F_FOUND) + # Check AVX512F + set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) + set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; + }" AVX512F_FOUND) +endif(NOT WIN32) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index abadda3adb..6b526f0103 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -3,13 +3,9 @@ add_subdirectory(platform) add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(string) - -add_subdirectory(pybind) -if (NOT WIN32) add_subdirectory(recordio) -endif(NOT WIN32) +add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) - add_subdirectory(train) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cb9057672c..43e1bc6b2e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -31,9 +31,7 @@ function(windows_symbolic TARGET) endfunction() add_subdirectory(ir) -if (NOT WIN32) add_subdirectory(details) -endif (NOT WIN32) # ddim lib proto_library(framework_proto SRCS framework.proto) @@ -68,11 +66,7 @@ if(WITH_GPU) else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() -if (NOT WIN32) - cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) -else() - cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) -endif (NOT WIN32) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) @@ -122,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) -if (NOT WIN32) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler) -else() -cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor) -endif(NOT WIN32) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -183,12 +172,10 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() -if (NOT WIN32) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor) -endif() # NOT WIN32 cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h 
b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 949616f02d..c3a8b85423 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -13,9 +13,9 @@ // limitations under the License. #pragma once +#include #include #include -#include "ThreadPool.h" #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index 2b265a773f..5bafa4345f 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -13,11 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -// logging.h and windows.h conflict -#define GLOG_NO_ABBREVIATED_SEVERITIES -// solve static linking error in windows -// https://github.com/google/glog/issues/301 -#define GOOGLE_GLOG_DLL_DECL #include "paddle/fluid/framework/tensor.h" #include "unsupported/Eigen/CXX11/Tensor" diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index ef2eb334a4..0e6e74293c 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,11 +23,6 @@ limitations under the License. */ #include #include -#if defined(_WIN32) -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#define GOOGLE_GLOG_DLL_DECL -#endif - #include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2b35943d09..1ec170b6f6 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL #include #include diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 6918e030bf..ef83833217 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -20,8 +20,6 @@ limitations under the License. */ #include #include #include -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 4e4ab47ca9..9dfa48d501 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -14,12 +14,6 @@ limitations under the License. 
*/ #pragma once -// logging.h and windows.h conflict -#define GLOG_NO_ABBREVIATED_SEVERITIES -// solve static linking error in windows -// https://github.com/google/glog/issues/301 -#define GOOGLE_GLOG_DLL_DECL - #include #include #include diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 9e0044c47a..26d3643f4e 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -15,6 +15,12 @@ #pragma once #include "paddle/fluid/memory/allocation/allocator.h" +#ifdef _WIN32 +#define posix_memalign_free _aligned_free +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) +#endif + namespace paddle { namespace memory { namespace allocation { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 81c9239486..de4f23515d 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE) add_subdirectory(distributed_ops) endif() -if (NOT WIN32) - add_subdirectory(reader) -endif() +add_subdirectory(reader) if (NOT WIN32) add_subdirectory(nccl) @@ -42,7 +40,7 @@ endif() register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS}) # warpctc_op needs cudnn 7 above -if (WITH_GPU) +if (WITH_GPU AND NOT WIN32) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() @@ -59,10 +57,12 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv) endif() diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 64096a717b..79980cda53 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -111,7 +111,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { auto pre_out_mat = EigenMatrix::From(*pre_out); auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); auto out_grad_mat = EigenMatrix::From(*out_grad); - Eigen::array bcast({{1, static_cast(pre_out_grad.dims()[1])}}); + Eigen::array bcast{1, static_cast(pre_out_grad.dims()[1])}; // softrelu derivative pre_out_grad_mat.device(place) = diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 83ee9f6c51..63363086ad 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ 
b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,6 +1,4 @@ -if (NOT WIN32) - add_subdirectory(detail) -endif(NOT WIN32) +add_subdirectory(detail) function(math_library TARGET) # math_library is a function to create math library. @@ -43,10 +41,8 @@ math_library(depthwise_conv) math_library(im2col) math_library(sampler) -if (NOT WIN32) # windows do not support avx functions yet. - math_library(gru_compute DEPS activation_functions math_function) - math_library(lstm_compute DEPS activation_functions) -endif (NOT WIN32) +math_library(gru_compute DEPS activation_functions math_function) +math_library(lstm_compute DEPS activation_functions) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) math_library(math_function DEPS blas) @@ -58,9 +54,9 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) math_library(sequence_scale) math_library(softmax DEPS math_function) -if (NOT WIN32) - math_library(matrix_bit_code) -endif (NOT WIN32) + +math_library(matrix_bit_code) + math_library(unpooling) math_library(vol2col) @@ -76,13 +72,12 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) -if (NOT WIN32) - set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc) - set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) - if(WITH_XBYAK) - list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) - list(APPEND JIT_KERNEL_DEPS xbyak) - endif() - cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) - cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) -endif (NOT WIN32) + +set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc) +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) +if(WITH_XBYAK) + list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) + list(APPEND JIT_KERNEL_DEPS xbyak) +endif() +cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) +cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index b127fbe8c8..2b3d38d95a 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 07854c8358..c329b8b611 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -67,7 +67,7 @@ inline constexpr size_t FindLastSet(size_t x) { : (std::is_same::value // NOLINT ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0) : (x ? 
8 * sizeof(x) - __builtin_clzll(x) : 0)); - +} #else // windows don't have built-in clz, ctz function template @@ -92,7 +92,6 @@ inline int clz(const T& value) { inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } #endif // !_WIN32 -} struct SimpleCode { SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {} diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 0f31ca1a94..901a92ab5b 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -74,7 +74,7 @@ class CreatePyReaderOpMaker : public FileReaderMakerBase { "Name of the `LoDTensorBlockingQueueHolder` variable"); AddComment(R"DOC( - Create PyReader to support LoDTensor data feeding in Python side. + Create PyReader to support LoDTensor data feeding in Python side. )DOC"); } }; diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index c57a34c3a7..79f189222e 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -35,10 +35,10 @@ class ROIAlignOp : public framework::OperatorWithKernel { "The format of input tensor is NCHW."); PADDLE_ENFORCE(rois_dims.size() == 2, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); PADDLE_ENFORCE(rois_dims[1] == 4, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); float spatial_scale = ctx->Attrs().Get("spatial_scale"); @@ -103,7 +103,7 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor), " "ROIs (Regions of Interest) to pool over. " "should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]. " + "given as [[x1, y1, x2, y2], ...]. " "(x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates."); AddOutput("Out", diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 043ea680d1..3f6b2e46c7 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -40,10 +40,10 @@ class ROIPoolOp : public framework::OperatorWithKernel { "The format of input tensor is NCHW."); PADDLE_ENFORCE(rois_dims.size() == 2, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); PADDLE_ENFORCE(rois_dims[1] == kROISize, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); @@ -110,7 +110,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor), " "ROIs (Regions of Interest) to pool over. " "should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]. " + "given as [[x1, y1, x2, y2], ...]. 
" "Where batch_id is the id of the data, " "(x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates."); diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index c047bc78ee..b579244673 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -86,7 +86,7 @@ class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker { .GreaterThan(1); AddComment(R"DOC( reorg operator used in Yolo v2. - The equation is: C2 = C1/blocksize * blocksize, W2 = W1 * blocksize + offset % blocksize, H2 = H1 * blocksize + offset / blocksize, + The equation is: C2 = C1/blocksize * blocksize, W2 = W1 * blocksize + offset % blocksize, H2 = H1 * blocksize + offset / blocksize, Reshape Input(X) into the shape according to Attr(blocksize). The data in Input(X) are unchanged. diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 0d0613e1a4..93cb5eb2dc 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,4 +1,3 @@ -if (NOT WIN32) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) py_proto_compile(profiler_py_proto SRCS profiler.proto) @@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _ add_dependencies(profiler_py_proto profiler_py_proto_init) +if (NOT WIN32) add_custom_command(TARGET profiler_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +else(NOT WIN32) +string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/") +add_custom_command(TARGET profiler_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) if(WITH_GPU) @@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) - -if (NOT WIN32) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) -endif(NOT WIN32) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 234a04b5c2..f2d691b293 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -29,6 +29,13 @@ namespace platform { void SetNumThreads(int num_threads) { #ifdef PADDLE_USE_OPENBLAS +// windows has no support for openblas multi-thread +// please refer to: https://github.com/PaddlePaddle/Paddle/issues/7234 +#ifdef _WIN32 + if (num_threads > 1) { + num_threads = 1; + } +#endif int real_num_threads = num_threads > 1 ? 
num_threads : 1; openblas_set_num_threads(real_num_threads); #elif defined(PADDLE_WITH_MKLML) diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index f59fc40b71..eaf047d474 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -13,17 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#if !defined(_WIN32) -#include -#else -#include -#endif // !_WIN32 - -#include #include // NOLINT #include #include "paddle/fluid/platform/dynload/cupti.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.pb.h" namespace paddle { @@ -32,15 +26,11 @@ namespace platform { /////////////////////// // WARN: Under Development. Don't depend on it yet. ////////////////////// -#if !defined(_WIN32) inline uint64_t PosixInNsec() { struct timeval tv; gettimeofday(&tv, nullptr); return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); } -#else -inline uint64_t PosixInNsec() { return static_cast(0); } -#endif // !_WIN32 // DeviceTracer performs the following tasks: // 1. Register cuda callbacks for various events: kernel, memcpy, etc. diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 065b940b9c..1a83ac7780 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL #include #include diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a251bfcd99..a85972bdb7 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -18,12 +18,6 @@ limitations under the License. */ #include // for __cxa_demangle #endif // __GNUC__ -#if defined(_WIN32) -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#define GOOGLE_GLOG_DLL_DECL -#endif - #ifdef PADDLE_WITH_CUDA #include #include @@ -127,14 +121,14 @@ struct EOFException : public std::exception { #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) #else // there is no equivalent intrinsics in msvc. -#define UNLIKELY(condition) (condition == 0) +#define UNLIKELY(condition) (condition) #endif #if !defined(_WIN32) #define LIKELY(condition) __builtin_expect(static_cast(condition), 1) #else // there is no equivalent intrinsics in msvc. -#define LIKELY(condition) (condition != 0) +#define LIKELY(condition) (condition) #endif template @@ -248,7 +242,6 @@ inline void throw_on_error(T e) { throw_on_error(e, ""); } -#if !defined(_WIN32) #define PADDLE_THROW(...) \ do { \ throw ::paddle::platform::EnforceNotMet( \ @@ -272,17 +265,6 @@ inline void throw_on_error(T e) { #define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG -#else // !_WIN32 -// disable enforce, caused by the varardic macro exception error -#define PADDLE_THROW(x) \ - do { \ - throw std::make_exception_ptr( \ - std::runtime_error("Windows disable the enforce.")); \ - } while (false) - -#define PADDLE_ENFORCE(x, ...) 
x -#endif // !_WIN32 - #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ @@ -302,20 +284,6 @@ inline void throw_on_error(T e) { * extra messages is also supported, for example: * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) */ -#if !defined(_WIN32) -#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) -#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) -#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) -#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) -#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) -#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) - #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ do { \ if (UNLIKELY(nullptr == (__VAL))) { \ @@ -335,27 +303,19 @@ inline void throw_on_error(T e) { paddle::string::Sprintf("" __VA_ARGS__)); \ } \ } while (0) -#else -#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1)) -#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1)) -#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1)) -#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1)) -#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1)) -#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1)) - -#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ - do { \ - if (!((__VAL0)__CMP(__VAL1))) { \ - PADDLE_THROW("Windows disable the enforce. Enforce failed."); \ - } \ - } while (0) -#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...) \ - do { \ - if (nullptr == (__VAL1)) { \ - PADDLE_THROW("Windows disable the enforce. Enforce failed"); \ - } \ - } while (0) -#endif // !_WIN32 + +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) 
\ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index e07e9d3825..0ccef6c6a8 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -117,13 +117,6 @@ void InitDevices(bool init_p2p, const std::vector devices) { places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); -// windows has no support for openblas multi-thread -#ifdef _WIN32 - if (FLAGS_paddle_num_threads > 1) { - FLAGS_paddle_num_threads = 1; - } -#endif - #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h index 992ca5e6f6..0e30594672 100644 --- a/paddle/fluid/platform/init.h +++ b/paddle/fluid/platform/init.h @@ -16,9 +16,6 @@ limitations under the License. */ #include #include -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL - #include "gflags/gflags.h" #include "glog/logging.h" diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index 8823e97b0b..ad070171df 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -27,8 +28,13 @@ #include // dladdr #include // backtrace #include +#include #include // std::accumulate #else +#define NOMINMAX // msvc max/min macro conflict with std::min/max +// solve static linking error in windows +// https://github.com/google/glog/issues/301 +#define GOOGLE_GLOG_DLL_DECL #include // _popen, _pclose #include #include @@ -57,6 +63,25 @@ static void *dlopen(const char *filename, int flag) { return reinterpret_cast(hModule); } +static int gettimeofday(struct timeval *tp, void *tzp) { + time_t clock; + struct tm tm; + SYSTEMTIME wtm; + + GetLocalTime(&wtm); + tm.tm_year = wtm.wYear - 1900; + tm.tm_mon = wtm.wMonth - 1; + tm.tm_mday = wtm.wDay; + tm.tm_hour = wtm.wHour; + tm.tm_min = wtm.wMinute; + tm.tm_sec = wtm.wSecond; + tm.tm_isdst = -1; + clock = mktime(&tm); + tp->tv_sec = clock; + tp->tv_usec = wtm.wMilliseconds * 1000; + + return (0); +} #endif // !_WIN32 static void ExecShellCommand(const std::string &cmd, std::string *message) { @@ -132,10 +157,12 @@ static void MkDir(const char *path) { } } #else - CreateDirectory(path, NULL); - auto errorno = GetLastError(); - if (errorno != ERROR_ALREADY_EXISTS) { - throw std::runtime_error(path_error); + BOOL return_value = CreateDirectory(path, NULL); + if (!return_value) { + auto errorno = GetLastError(); + if (errorno != ERROR_ALREADY_EXISTS) { + throw std::runtime_error(path_error); + } } #endif // !_WIN32 } diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 56bf9e31a3..998242fb4a 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/port.h" -#include #include #include #include diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index e8eae874af..f5d3490634 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx); -#if !defined(_WIN32) struct RecordEvent { // dev_ctx can be set to nullptr if device is cpu. RecordEvent(const std::string& name, const DeviceContext* dev_ctx); @@ -106,15 +105,6 @@ struct RecordBlock { std::string name_; uint64_t start_ns_; }; -#else -// windows do not support profiler temporarily. -struct RecordEvent { - RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {} -}; -struct RecordBlock { - explicit RecordBlock(int block_id) {} -}; -#endif // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 0e88a439cf..11c68f3449 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -45,16 +45,15 @@ class StreamCallbackManager { inline void AddCallback(Callback &&callback) const { auto *stream_callback_context = new StreamCallbackContext(this, std::forward(callback)); - PADDLE_ENFORCE( #if CUDA_VERSION >= 10000 - cudaLaunchHostFunc(stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context) + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, + StreamCallbackManager::StreamCallbackFunc, + stream_callback_context)); // NOLINT #else - cudaStreamAddCallback(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0) + PADDLE_ENFORCE(cudaStreamAddCallback( + stream_, StreamCallbackManager::StreamCallbackFunc, + stream_callback_context, 0)); // NOLINT #endif - ); // NOLINT } void Wait() const { thread_pool_.reset(new ThreadPool(1)); } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 6417da077e..fb6ee2f4a5 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,10 +1,6 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) -if(NOT WIN32) - list(APPEND PYBIND_DEPS parallel_executor profiler) - list(APPEND PYBIND_SRCS recordio.cc) -endif(NOT WIN32) +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5ef5bf4d6c..6cc3a1739a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,13 +21,6 @@ limitations under the License. 
*/ #include #include -#if defined(_WIN32) -#define NOMINMAX -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#define GOOGLE_GLOG_DLL_DECL -#include -#endif - #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" @@ -36,9 +29,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" -#ifndef _WIN32 #include "paddle/fluid/framework/parallel_executor.h" -#endif #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" @@ -359,22 +350,16 @@ All parameter, weight, gradient are variables in Paddle. return self.GetMutable(); }, py::return_value_policy::reference) - #endif -#ifndef _WIN32 .def("get_reader", [](Variable &self) -> framework::ReaderHolder * { PADDLE_ENFORCE(self.IsType()); return self.GetMutable(); }, - py::return_value_policy::reference) -#endif - ; // NOLINT + py::return_value_policy::reference); -#if !defined(_WIN32) py::class_(m, "Reader", "") .def("reset", &framework::ReaderHolder::ResetAll); -#endif using LoDTensorBlockingQueue = ::paddle::operators::reader::LoDTensorBlockingQueue; @@ -643,7 +628,6 @@ All parameter, weight, gradient are variables in Paddle. #endif #endif -#ifndef _WIN32 py::enum_(m, "ProfilerState", py::arithmetic()) .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) @@ -664,7 +648,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); -#endif py::class_> pass(m, "Pass"); pass.def(py::init()) @@ -693,7 +676,6 @@ All parameter, weight, gradient are variables in Paddle. .def("remove_pass", [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); -#ifndef _WIN32 // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( @@ -921,7 +903,6 @@ All parameter, weight, gradient are variables in Paddle. }); BindRecordIOWriter(&m); -#endif return m.ptr(); } } // namespace pybind diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index f2f49f813a..543acf2d34 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -115,9 +115,8 @@ def __bootstrap__(): 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb', - 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir' + "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', + 'reader_queue_speed_test_mode', 'print_sub_graph_dir' ] if os.name != 'nt': read_env_flags.append('warpctc_dir') diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py index b966ae01d0..b8d5f4ffea 100644 --- a/python/paddle/fluid/contrib/inferencer.py +++ b/python/paddle/fluid/contrib/inferencer.py @@ -15,15 +15,13 @@ from __future__ import print_function import contextlib -import os from .. import core from .. import executor from .. import framework from .. 
import io -if os.name != 'nt': - from .. import parallel_executor +from .. import parallel_executor from .. import unique_name from .trainer import check_and_get_place diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py index 096821a5ba..8569e486f9 100644 --- a/python/paddle/fluid/contrib/trainer.py +++ b/python/paddle/fluid/contrib/trainer.py @@ -28,8 +28,7 @@ from .. import framework from .. import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module from .. import optimizer as opt_module -if os.name != 'nt': - from .. import parallel_executor +from .. import parallel_executor from ..transpiler import distribute_transpiler __all__ = [ diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index a9075045a2..3f47053961 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -347,72 +347,70 @@ def _copy_reader_create_op_(block, op): return new_op -if os.name != 'nt': - - @templatedoc(op_type='create_recordio_file_reader') - def open_recordio_file(filename, - shapes, - lod_levels, - dtypes, - pass_num=1, - for_parallel=True): - """ - ${comment} - - Args: - filename(${filename_type}): ${filename_comment}. - shapes(list): List of tuples which declaring data shapes. - lod_levels(${lod_levels_type}): ${lod_levels_comment}. - dtypes(list): List of strs which declaring data type. - pass_num(int): Number of passes to run. - for_parallel(Bool): Set it as True if you are going to run - subsequent operators in parallel. - - Returns: - ${out_comment}. - - Examples: - - >>> import paddle.fluid as fluid - >>> reader = fluid.layers.io.open_recordio_file( - >>> filename='./data.recordio', - >>> shapes=[(3,224,224), (1)], - >>> lod_levels=[0, 0], - >>> dtypes=['float32', 'int64']) - >>> # Via the reader, we can use 'read_file' layer to get data: - >>> image, label = fluid.layers.io.read_file(reader) - """ - dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] - shape_concat = [] - ranks = [] +@templatedoc(op_type='create_recordio_file_reader') +def open_recordio_file(filename, + shapes, + lod_levels, + dtypes, + pass_num=1, + for_parallel=True): + """ + ${comment} - for shape in shapes: - shape_concat.extend(shape) - ranks.append(len(shape)) + Args: + filename(${filename_type}): ${filename_comment}. + shapes(list): List of tuples which declaring data shapes. + lod_levels(${lod_levels_type}): ${lod_levels_comment}. + dtypes(list): List of strs which declaring data type. + pass_num(int): Number of passes to run. + for_parallel(Bool): Set it as True if you are going to run + subsequent operators in parallel. - var_name = unique_name('open_recordio_file') + Returns: + ${out_comment}. 
- startup_blk = default_startup_program().current_block() - startup_var = startup_blk.create_var(name=var_name) - startup_blk.append_op( - type='create_recordio_file_reader', - outputs={'Out': [startup_var]}, - attrs={ - 'shape_concat': shape_concat, - 'lod_levels': lod_levels, - 'filename': filename, - 'ranks': ranks - }) + Examples: - startup_var.desc.set_dtypes(dtypes) - startup_var.persistable = True - main_prog_var = _copy_reader_var_( - default_main_program().current_block(), startup_var) + >>> import paddle.fluid as fluid + >>> reader = fluid.layers.io.open_recordio_file( + >>> filename='./data.recordio', + >>> shapes=[(3,224,224), (1)], + >>> lod_levels=[0, 0], + >>> dtypes=['float32', 'int64']) + >>> # Via the reader, we can use 'read_file' layer to get data: + >>> image, label = fluid.layers.io.read_file(reader) + """ + dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] + shape_concat = [] + ranks = [] - if pass_num > 1: - main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) + + var_name = unique_name('open_recordio_file') + + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=var_name) + startup_blk.append_op( + type='create_recordio_file_reader', + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'filename': filename, + 'ranks': ranks + }) - return monkey_patch_reader_methods(main_prog_var) + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + main_prog_var = _copy_reader_var_(default_main_program().current_block(), + startup_var) + + if pass_num > 1: + main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) + + return monkey_patch_reader_methods(main_prog_var) def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 7b0a3e2c82..e0cc09a4c7 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -343,128 +343,126 @@ def embedding(input, return tmp -if os.name != 'nt': +@templatedoc(op_type="lstm") +def dynamic_lstm(input, + size, + h_0=None, + c_0=None, + param_attr=None, + bias_attr=None, + use_peepholes=True, + is_reverse=False, + gate_activation='sigmoid', + cell_activation='tanh', + candidate_activation='tanh', + dtype='float32', + name=None): + """ + ${comment} - @templatedoc(op_type="lstm") - def dynamic_lstm(input, - size, - h_0=None, - c_0=None, - param_attr=None, - bias_attr=None, - use_peepholes=True, - is_reverse=False, - gate_activation='sigmoid', - cell_activation='tanh', - candidate_activation='tanh', - dtype='float32', - name=None): - """ - ${comment} - - Args: - input (Variable): ${input_comment} - size (int): 4 * hidden size. - h_0(Variable): The initial hidden state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size and D is the hidden size. - c_0(Variable): The initial cell state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size. `h_0` and `c_0` can be NULL but only at the same time. - param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weights. - - - Weights = {:math:`W_{ch}, W_{ih}, \ - W_{fh}, W_{oh}`} - - The shape is (D x 4D), where D is the hidden - size. 
- - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as param_attr. - If the Initializer of the param_attr is not set, the - parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The bias attribute for the learnable bias - weights, which contains two parts, input-hidden - bias weights and peephole connections weights if - setting `use_peepholes` to `True`. - - 1. `use_peepholes = False` - - Biases = {:math:`b_c, b_i, b_f, b_o`}. - - The shape is (1 x 4D). - 2. `use_peepholes = True` - - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ - W_{fc}, W_{oc}`}. - - The shape is (1 x 7D). - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as bias_attr. - If the Initializer of the bias_attr is not set, - the bias is initialized zero. Default: None. - use_peepholes (bool): ${use_peepholes_comment} - is_reverse (bool): ${is_reverse_comment} - gate_activation (str): ${gate_activation_comment} - cell_activation (str): ${cell_activation_comment} - candidate_activation (str): ${candidate_activation_comment} - dtype (str): Data type. Choices = ["float32", "float64"], default "float32". - name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - - Returns: - tuple: The hidden state, and cell state of LSTM. The shape of both \ - is (T x D), and lod is the same with the `input`. - - Examples: - .. code-block:: python - - hidden_dim = 512 - forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, - bias_attr=False) - forward, _ = fluid.layers.dynamic_lstm( - input=forward_proj, size=hidden_dim * 4, use_peepholes=False) - """ - assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." - helper = LayerHelper('lstm', **locals()) - size = size // 4 - weight = helper.create_parameter( - attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) - bias_size = [1, 7 * size] - if not use_peepholes: - bias_size[1] = 4 * size - bias = helper.create_parameter( - attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + Args: + input (Variable): ${input_comment} + size (int): 4 * hidden size. + h_0(Variable): The initial hidden state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size and D is the hidden size. + c_0(Variable): The initial cell state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size. `h_0` and `c_0` can be NULL but only at the same time. + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weights. - hidden = helper.create_variable_for_type_inference(dtype) - cell = helper.create_variable_for_type_inference(dtype) - batch_gate = helper.create_variable_for_type_inference(dtype) - batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) - inputs = {'Input': input, 'Weight': weight, 'Bias': bias} - batch_size = input.shape[0] - if h_0: - assert h_0.shape == (batch_size, size), \ - 'The shape of h0 should be (batch_size, %d)' % size - inputs['H0'] = h_0 - if c_0: - assert c_0.shape == (batch_size, size), \ - 'The shape of c0 should be (batch_size, %d)' % size - inputs['C0'] = c_0 + - Weights = {:math:`W_{ch}, W_{ih}, \ + W_{fh}, W_{oh}`} + - The shape is (D x 4D), where D is the hidden + size. 
- helper.append_op( - type='lstm', - inputs=inputs, - outputs={ - 'Hidden': hidden, - 'Cell': cell, - 'BatchGate': batch_gate, - 'BatchCellPreAct': batch_cell_pre_act - }, - attrs={ - 'use_peepholes': use_peepholes, - 'is_reverse': is_reverse, - 'gate_activation': gate_activation, - 'cell_activation': cell_activation, - 'candidate_activation': candidate_activation - }) - return hidden, cell + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as param_attr. + If the Initializer of the param_attr is not set, the + parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|None): The bias attribute for the learnable bias + weights, which contains two parts, input-hidden + bias weights and peephole connections weights if + setting `use_peepholes` to `True`. + + 1. `use_peepholes = False` + - Biases = {:math:`b_c, b_i, b_f, b_o`}. + - The shape is (1 x 4D). + 2. `use_peepholes = True` + - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ + W_{fc}, W_{oc}`}. + - The shape is (1 x 7D). + + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as bias_attr. + If the Initializer of the bias_attr is not set, + the bias is initialized zero. Default: None. + use_peepholes (bool): ${use_peepholes_comment} + is_reverse (bool): ${is_reverse_comment} + gate_activation (str): ${gate_activation_comment} + cell_activation (str): ${cell_activation_comment} + candidate_activation (str): ${candidate_activation_comment} + dtype (str): Data type. Choices = ["float32", "float64"], default "float32". + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + tuple: The hidden state, and cell state of LSTM. The shape of both \ + is (T x D), and lod is the same with the `input`. + + Examples: + .. code-block:: python + + hidden_dim = 512 + forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=forward_proj, size=hidden_dim * 4, use_peepholes=False) + """ + assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." 
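+    # Sizing below follows the shapes documented above: the incoming `size`
+    # is 4 * hidden_size, so `size // 4` recovers the hidden width D. The
+    # hidden-hidden weight is then (D x 4D); the bias is (1 x 7D) when
+    # peephole connections are used (b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc)
+    # and (1 x 4D) otherwise.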
+ helper = LayerHelper('lstm', **locals()) + size = size // 4 + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) + bias_size = [1, 7 * size] + if not use_peepholes: + bias_size[1] = 4 * size + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + + hidden = helper.create_variable_for_type_inference(dtype) + cell = helper.create_variable_for_type_inference(dtype) + batch_gate = helper.create_variable_for_type_inference(dtype) + batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) + inputs = {'Input': input, 'Weight': weight, 'Bias': bias} + batch_size = input.shape[0] + if h_0: + assert h_0.shape == (batch_size, size), \ + 'The shape of h0 should be (batch_size, %d)' % size + inputs['H0'] = h_0 + if c_0: + assert c_0.shape == (batch_size, size), \ + 'The shape of c0 should be (batch_size, %d)' % size + inputs['C0'] = c_0 + + helper.append_op( + type='lstm', + inputs=inputs, + outputs={ + 'Hidden': hidden, + 'Cell': cell, + 'BatchGate': batch_gate, + 'BatchCellPreAct': batch_cell_pre_act + }, + attrs={ + 'use_peepholes': use_peepholes, + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'cell_activation': cell_activation, + 'candidate_activation': candidate_activation + }) + return hidden, cell def dynamic_lstmp(input, @@ -963,43 +961,39 @@ def linear_chain_crf(input, label, param_attr=None): return log_likelihood -if os.name != 'nt': - - @templatedoc() - def crf_decoding(input, param_attr, label=None): - """ - ${comment} +@templatedoc() +def crf_decoding(input, param_attr, label=None): + """ + ${comment} - Args: - input(${emission_type}): ${emission_comment} + Args: + input(${emission_type}): ${emission_comment} - param_attr(ParamAttr): The parameter attribute for training. + param_attr(ParamAttr): The parameter attribute for training. - label(${label_type}): ${label_comment} + label(${label_type}): ${label_comment} - Returns: - Variable: ${viterbi_path_comment} + Returns: + Variable: ${viterbi_path_comment} - Examples: - .. code-block:: python + Examples: + .. code-block:: python - crf_decode = layers.crf_decoding( - input=hidden, param_attr=ParamAttr(name="crfw")) - """ - helper = LayerHelper('crf_decoding', **locals()) - transition = helper.get_parameter(param_attr.name) - viterbi_path = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) - helper.append_op( - type='crf_decoding', - inputs={ - "Emission": [input], + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + """ + helper = LayerHelper('crf_decoding', **locals()) + transition = helper.get_parameter(param_attr.name) + viterbi_path = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + helper.append_op( + type='crf_decoding', + inputs={"Emission": [input], "Transition": transition, - "Label": label - }, - outputs={"ViterbiPath": [viterbi_path]}) + "Label": label}, + outputs={"ViterbiPath": [viterbi_path]}) - return viterbi_path + return viterbi_path @templatedoc() @@ -5593,48 +5587,42 @@ def label_smooth(label, return smooth_label -if os.name != 'nt': - - @templatedoc() - def roi_pool(input, - rois, - pooled_height=1, - pooled_width=1, - spatial_scale=1.0): - """ - ${comment} - - Args: - input (Variable): ${x_comment} - rois (Variable): ROIs (Regions of Interest) to pool over. 
- pooled_height (integer): ${pooled_height_comment} Default: 1 - pooled_width (integer): ${pooled_width_comment} Default: 1 - spatial_scale (float): ${spatial_scale_comment} Default: 1.0 - - Returns: - Variable: ${out_comment}. - - Examples: - .. code-block:: python - - pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) - """ - helper = LayerHelper('roi_pool', **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - argmaxes = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="roi_pool", - inputs={"X": input, - "ROIs": rois}, - outputs={"Out": pool_out, - "Argmax": argmaxes}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale - }) - return pool_out +@templatedoc() +def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): + """ + ${comment} + + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + + Returns: + Variable: ${out_comment}. + + Examples: + .. code-block:: python + + pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) + """ + helper = LayerHelper('roi_pool', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + argmaxes = helper.create_variable_for_type_inference(dtype='int32') + helper.append_op( + type="roi_pool", + inputs={"X": input, + "ROIs": rois}, + outputs={"Out": pool_out, + "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale + }) + return pool_out @templatedoc() diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 66eb1229aa..6c18af7283 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -100,26 +100,27 @@ Examples: >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) """ -if os.name != 'nt': - __all__ += ['cumsum'] - - _cum_sum_ = generate_layer_fn('cumsum') - - def cumsum(x, axis=None, exclusive=None, reverse=None): - locals_var = locals().keys() - kwargs = dict() - for name in locals_var: - val = locals()[name] - if val is not None: - kwargs[name] = val - return _cum_sum_(**kwargs) - - cumsum.__doc__ = _cum_sum_.__doc__ + """ - Examples: - - >>> data = fluid.layers.data(name="input", shape=[32, 784]) - >>> result = fluid.layers.cumsum(data, axis=0) - """ +__all__ += ['cumsum'] + +_cum_sum_ = generate_layer_fn('cumsum') + + +def cumsum(x, axis=None, exclusive=None, reverse=None): + locals_var = locals().keys() + kwargs = dict() + for name in locals_var: + val = locals()[name] + if val is not None: + kwargs[name] = val + return _cum_sum_(**kwargs) + + +cumsum.__doc__ = _cum_sum_.__doc__ + """ +Examples: + + >>> data = fluid.layers.data(name="input", shape=[32, 784]) + >>> result = fluid.layers.cumsum(data, axis=0) +""" __all__ += ['thresholded_relu'] From 982e48922020e8d5f3ddcfc682068fcbdc5b7fe2 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 22 Nov 2018 06:26:04 +0000 Subject: [PATCH 69/80] test=develop --- python/paddle/fluid/layers/nn.py | 5 +++-- python/paddle/fluid/tests/unittests/test_layers.py | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py 
index 99acd7e308..32d411b830 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2139,8 +2139,9 @@ def pool2d(input, input tensor is NCHW, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. - pool_size (int): The side length of pooling windows. All pooling - windows are squares with pool_size on a side. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. pool_type: ${pooling_type_comment} pool_stride (int): stride of the pooling layer. pool_padding (int): padding size. diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index a8fa5436c4..c4310fe006 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -202,6 +202,12 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_unpad(x=x, length=length)) print(str(program)) + def test_pool2d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') + self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3])) + def test_lstm_unit(self): program = Program() with program_guard(program): From 1adda8e06c075d55edcc6aa50804eab62b903f72 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Thu, 22 Nov 2018 06:53:16 +0000 Subject: [PATCH 70/80] Add more unit tests for split plugin test=develop --- .../inference/tensorrt/convert/split_op.cc | 13 ++--- .../tensorrt/convert/test_split_op.cc | 47 ++++++++++++++++--- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 871354267e..ae5b1b9806 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -19,9 +19,6 @@ namespace paddle { namespace inference { namespace tensorrt { -/* - * SplitOp. - */ class SplitOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, @@ -40,15 +37,11 @@ class SplitOpConverter : public OpConverter { int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = boost::get>(op_desc.GetAttr("sections")); - // PADDLE_ENFORCE(axis != 0); - if (axis < 0) { - axis += input_dims.nbDims; - } else { - axis -= 1; - } + // split on batch is not supported in TensorRT + PADDLE_ENFORCE(axis != 0); + axis += (axis < 0) ? 
input_dims.nbDims : -1;
     PADDLE_ENFORCE(output_lengths.size() == output_num);
-    //
     plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
     nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
index 23909378dd..5aacc5c600 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
@@ -59,21 +59,54 @@ void TensorRTSplitTest(const std::vector<int> &in_shape,
   validator.Execute(BatchSize);
 }
 
-TEST(split_op, test_same_shape_batch1) {
+// batch = 1, axis = 1, same shape
+TEST(split_op, test_same_shape_axis1_batch1) {
   TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2});
 }
-
-TEST(split_op, test_different_shape_batch1) {
+// batch = 1, axis = 1, different shape
+TEST(split_op, test_different_shape_axis1_batch1) {
   TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1});
 }
-
-TEST(split_op, test_same_shape_batch10) {
+// batch = 10, axis = 1, same shape
+TEST(split_op, test_same_shape_axis1_batch10) {
   TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2});
 }
-
-TEST(split_op, test_different_shape_batch10) {
+// batch = 10, axis = 1, different shape
+TEST(split_op, test_different_shape_axis1_batch10) {
   TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1});
 }
+// batch = 1, axis = 2, same shape
+TEST(split_op, test_same_shape_axis2_batch1) {
+  TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2});
+}
+// batch = 1, axis = 2, different shape
+TEST(split_op, test_different_shape_axis2_batch1) {
+  TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1});
+}
+// batch = 10, axis = 2, same shape
+TEST(split_op, test_same_shape_axis2_batch10) {
+  TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2});
+}
+// batch = 10, axis = 2, different shape
+TEST(split_op, test_different_shape_axis2_batch10) {
+  TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1});
+}
+// batch = 1, axis = 3, same shape
+TEST(split_op, test_same_shape_axis3_batch1) {
+  TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2});
+}
+// batch = 1, axis = 3, different shape
+TEST(split_op, test_different_shape_axis3_batch1) {
+  TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1});
+}
+// batch = 10, axis = 3, same shape
+TEST(split_op, test_same_shape_axis3_batch10) {
+  TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2});
+}
+// batch = 10, axis = 3, different shape
+TEST(split_op, test_different_shape_axis3_batch10) {
+  TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1});
+}
 } // namespace tensorrt
 } // namespace inference

From ae7d22862be83c5ca5ed2d820a11fd8ab523766d Mon Sep 17 00:00:00 2001
From: Dun
Date: Thu, 22 Nov 2018 15:42:28 +0800
Subject: [PATCH 71/80] Group Norm (#13843)

Add group normalization operator.
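As a reviewer's aid, here is a minimal NumPy sketch of the forward computation
this operator implements. It mirrors the `group_norm_naive` reference added in
the new unit test below; the function name is illustrative, and it assumes
NCHW input whose channel count is divisible by `groups` (the kernels
themselves also handle the non-divisible case via
`group_size = (C - 1) / groups + 1`):

    import numpy as np

    def group_norm_ref(x, scale, bias, groups, epsilon=1e-5):
        # x has shape (N, C, H, W); mean and variance are computed per
        # (sample, group) over that group's C/G channels and all H*W positions.
        N, C, H, W = x.shape
        g = x.reshape((N * groups, -1))
        mean = g.mean(axis=1, keepdims=True)
        var = g.var(axis=1, keepdims=True)
        y = ((g - mean) / np.sqrt(var + epsilon)).reshape((N, C, H, W))
        # scale and bias are per-channel and broadcast over H and W.
        return y * scale.reshape((-1, 1, 1)) + bias.reshape((-1, 1, 1))

The CPU and CUDA kernels below compute the same per-(sample, group) statistics
and then apply the per-channel affine transform.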
--- AUTHORS.md | 1 + paddle/fluid/API.spec | 1 + paddle/fluid/operators/group_norm_op.cc | 162 ++++++++++ paddle/fluid/operators/group_norm_op.cu | 292 ++++++++++++++++++ paddle/fluid/operators/group_norm_op.h | 197 ++++++++++++ python/paddle/fluid/layers/nn.py | 79 +++++ .../paddle/fluid/tests/unittests/op_test.py | 10 +- .../tests/unittests/test_group_norm_op.py | 143 +++++++++ 8 files changed, 880 insertions(+), 5 deletions(-) create mode 100644 paddle/fluid/operators/group_norm_op.cc create mode 100644 paddle/fluid/operators/group_norm_op.cu create mode 100644 paddle/fluid/operators/group_norm_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_group_norm_op.py diff --git a/AUTHORS.md b/AUTHORS.md index 54a1097b50..deafa64120 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -25,6 +25,7 @@ | kexinzhao | Ke-Xin Zhao | | kuke | Yi-Bing Liu | | lcy-seso | Ying Cao | +| cjld | Dun Liang | | lipeng-unisound | Peng Li | | liuyuan | Yuan Liu | | livc | Zhao Li | diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index da8941c351..541c4db1fa 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -103,6 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) +paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)) paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False)) paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc new file mode 100644 index 0000000000..6322659b67 --- /dev/null +++ b/paddle/fluid/operators/group_norm_op.cc @@ -0,0 +1,162 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/group_norm_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +class GroupNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of GroupNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), + "Output(Y) of GroupNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Mean"), + "Output(Mean) of GroupNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Variance"), + "Output(Variance) of GroupNormOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto channel_num = x_dim[1]; + auto batch_size = x_dim[0]; + auto groups = ctx->Attrs().Get("groups"); + PADDLE_ENFORCE_LE( + groups, channel_num, + "'groups' must be less equal than the number of channels."); + PADDLE_ENFORCE_GE(groups, 1, "'groups' must be greater equal than 1."); + + if (ctx->HasInput("Scale")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], channel_num); + } + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], channel_num); + } + + ctx->SetOutputDim("Y", ctx->GetInputDim("X")); + ctx->SetOutputDim("Mean", {batch_size, groups}); + ctx->SetOutputDim("Variance", {batch_size, groups}); + ctx->ShareLoD("X", "Y"); + } +}; + +class GroupNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor."); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C" + "that is applied to the output.") + .AsDispensable(); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output") + .AsDispensable(); + AddOutput("Y", "Result after normalization."); + AddOutput("Mean", "Mean of each group.").AsIntermediate(); + AddOutput("Variance", "Variance of each group.").AsIntermediate(); + + AddAttr("epsilon", + "Constant for numerical stability [default 1e-5].") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 1.0f, + "'epsilon' should be between 0.0 and 1.0."); + }); + AddAttr("groups", "The number of groups that divided from channels.") + .AddCustomChecker([](const int &groups) { + PADDLE_ENFORCE_GT(groups, 0, "'groups' should be greater than zero."); + }); + + AddComment(R"DOC( +Group Normalization + +Refer to `Group Normalization `_ +)DOC"); + } +}; + +class GroupNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of GroupNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mean"), + "Input(Mean) of GroupNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Variance"), + "Input(Variance) of GroupNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) of GroupNormOp should not be null."); + + // check output + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + if 
(ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), + ctx->GetInputDim("Scale")); + } + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + return framework::OpKernelType(framework::ToDataType(t->type()), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp); +REGISTER_OP_CPU_KERNEL( + group_norm, ops::GroupNormKernel, + ops::GroupNormKernel); +REGISTER_OP_CPU_KERNEL( + group_norm_grad, + ops::GroupNormGradKernel, + ops::GroupNormGradKernel); diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu new file mode 100644 index 0000000000..2717463022 --- /dev/null +++ b/paddle/fluid/operators/group_norm_op.cu @@ -0,0 +1,292 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/operators/group_norm_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, + int imsize, int groups, + int group_size, T* mean, T* var) { + int gid = blockIdx.y; + int cid = blockIdx.x; + int bid = blockIdx.z; + int number = min(group_size, static_cast(C - gid * group_size)); + int ccid = gid * group_size + cid; + if (ccid >= C) return; + T x_mean = 0, x_var = 0; + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + T val = x[(bid * C + ccid) * imsize + imid]; + x_mean += val; + x_var += val * val; + } + x_mean /= number * imsize; + x_var /= number * imsize; + __shared__ T s_mem[2]; + if (threadIdx.x == 0) { + s_mem[0] = s_mem[1] = 0; + } + __syncthreads(); + paddle::platform::CudaAtomicAdd(&s_mem[0], x_mean); + paddle::platform::CudaAtomicAdd(&s_mem[1], x_var); + __syncthreads(); + if (threadIdx.x == 0) { + paddle::platform::CudaAtomicAdd(&mean[bid * groups + gid], s_mem[0]); + paddle::platform::CudaAtomicAdd(&var[bid * groups + gid], s_mem[1]); + } +} + +template +__global__ void GroupNormForward(const T* x, const T* mean, const T* var, + const T* scale, const T* bias, int N, int C, + int imsize, int groups, int group_size, + T epsilon, T* y, T* real_var) { + int gid = blockIdx.y; + int cid = blockIdx.x; + int bid = blockIdx.z; + int ccid = gid * group_size + cid; + if (ccid >= C) return; + T x_mean = mean[bid * groups + gid]; + T x_var = var[bid * groups + gid]; + x_var = x_var - x_mean * x_mean; + T var_inv = 1.0 / sqrt(x_var + epsilon); + if (cid == 0 && threadIdx.x == 0) real_var[bid * groups + gid] = x_var; + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + T val = x[(bid * C + ccid) * imsize + imid]; + val = (val - x_mean) * var_inv; + if (scale) val *= scale[gid * group_size + cid]; + if (bias) val += bias[gid * group_size + cid]; + y[(bid * C + ccid) * imsize + imid] = val; + } +} + +template +class GroupNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* x = ctx.Input("X"); + + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); + const auto groups = ctx.Attr("groups"); + + const auto x_dims = x->dims(); + const int group_size = (x_dims[1] - 1) / groups + 1; + + y->mutable_data(ctx.GetPlace()); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + Tensor temp_var; + temp_var.mutable_data(var->dims(), ctx.GetPlace()); + + set_zero(dev_ctx, mean, static_cast(0)); + set_zero(dev_ctx, &temp_var, static_cast(0)); + + auto* x_data = x->data(); + auto* y_data = y->data(); + auto* mean_data = mean->data(); + auto* var_data = var->data(); + auto* temp_var_data = temp_var.data(); + + const T* scale_data = nullptr; + if (scale) scale_data = scale->data(); + const T* bias_data = nullptr; + if (bias) bias_data = bias->data(); + + int imsize = x_dims[2] * x_dims[3]; + int block_size = std::min(512, imsize); + dim3 grid(group_size, groups, x_dims[0]); + dim3 threads(block_size, 1, 1); + GroupNormForwardGetMeanAndVar<<>>( + x_data, x_dims[0], x_dims[1], imsize, groups, group_size, mean_data, + temp_var_data); + GroupNormForward<<>>( + x_data, mean_data, temp_var_data, scale_data, bias_data, x_dims[0], 
+ x_dims[1], imsize, groups, group_size, epsilon, y_data, var_data); + } +}; + +template +__global__ void GroupNormBackwardGetMeanAndVar( + const T* x, const T* mean, const T* var, const T* scale, const T* d_y, + int N, int C, int imsize, int groups, int group_size, T epsilon, T* d_x, + T* d_mean, T* d_var, T* d_scale, T* d_bias) { + int gid = blockIdx.y; + int cid = blockIdx.x; + int bid = blockIdx.z; + int number = min(group_size, static_cast(C - gid * group_size)); + int ccid = gid * group_size + cid; + if (ccid >= C) return; + T x_mean = mean[bid * groups + gid]; + T x_var = var[bid * groups + gid]; + T var_inv = 1.0 / sqrt(x_var + epsilon); + T d_var_inv = 0, d_x_mean = 0; + T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0; + + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + T tmp = x[(bid * C + ccid) * imsize + imid]; + T val = (tmp - x_mean) * var_inv; + T dval = d_y[(bid * C + ccid) * imsize + imid]; + if (d_bias) d_bias_data += dval; + if (d_scale) d_scale_data += val * dval; + if (scale) dval = dval * scale[ccid]; + d_var_data += (tmp - x_mean) * dval; + T d_tmp = dval * var_inv; + if (d_x) d_x[(bid * C + ccid) * imsize + imid] = d_tmp; + d_mean_data -= d_tmp; + } + + __shared__ T s_mem[4]; + if (threadIdx.x == 0) { + s_mem[0] = s_mem[1] = 0; + if (d_scale) s_mem[2] = 0; + if (d_bias) s_mem[3] = 0; + } + __syncthreads(); + paddle::platform::CudaAtomicAdd(&s_mem[0], d_mean_data); + paddle::platform::CudaAtomicAdd(&s_mem[1], d_var_data); + if (d_scale) paddle::platform::CudaAtomicAdd(&s_mem[2], d_scale_data); + if (d_bias) paddle::platform::CudaAtomicAdd(&s_mem[3], d_bias_data); + __syncthreads(); + if (threadIdx.x == 0) { + paddle::platform::CudaAtomicAdd(&d_mean[bid * groups + gid], s_mem[0]); + paddle::platform::CudaAtomicAdd(&d_var[bid * groups + gid], s_mem[1]); + if (d_scale) paddle::platform::CudaAtomicAdd(&d_scale[ccid], s_mem[2]); + if (d_bias) paddle::platform::CudaAtomicAdd(&d_bias[ccid], s_mem[3]); + } +} + +template +__global__ void GroupNormBackward(const T* x, const T* mean, const T* var, + const T* d_mean, const T* d_var, int N, int C, + int imsize, int groups, int group_size, + T epsilon, T* d_x) { + int gid = blockIdx.y; + int cid = blockIdx.x; + int bid = blockIdx.z; + int number = min(group_size, static_cast(C - gid * group_size)); + int ccid = gid * group_size + cid; + if (ccid >= C) return; + T x_mean = mean[bid * groups + gid]; + T x_var = var[bid * groups + gid]; + T d_x_mean = d_mean[bid * groups + gid]; + T d_var_inv = d_var[bid * groups + gid]; + + T d_x_var = + -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv; + d_x_mean -= 2 * d_x_var * x_mean; + d_x_var /= number * imsize; + d_x_mean /= number * imsize; + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + T tmp = x[(bid * C + ccid) * imsize + imid]; + if (d_x) + d_x[(bid * C + ccid) * imsize + imid] += d_x_mean + tmp * 2 * d_x_var; + } +} + +template +class GroupNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto* x = ctx.Input("X"); + auto* mean = ctx.Input("Mean"); + auto* var = ctx.Input("Variance"); + auto* scale = ctx.Input("Scale"); + auto* d_y = ctx.Input(framework::GradVarName("Y")); + const auto groups = ctx.Attr("groups"); + + // init output + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_scale = ctx.Output(framework::GradVarName("Scale")); + auto* d_bias = 
ctx.Output(framework::GradVarName("Bias")); + + const auto& x_dims = x->dims(); + const int group_size = (x_dims[1] - 1) / groups + 1; + + T* d_x_data = nullptr; + if (d_x) { + d_x->mutable_data(ctx.GetPlace()); + d_x_data = d_x->data(); + } + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + + Tensor temp_var; + temp_var.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_var, static_cast(0)); + T* temp_var_data = temp_var.data(); + + Tensor temp_mean; + temp_mean.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_mean, static_cast(0)); + T* temp_mean_data = temp_mean.data(); + + auto* x_data = x->data(); + auto* y_data = d_y->data(); + auto* mean_data = mean->data(); + auto* var_data = var->data(); + T* d_scale_data = nullptr; + if (d_scale) { + d_scale->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_scale, static_cast(0)); + d_scale_data = d_scale->data(); + } + T* d_bias_data = nullptr; + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_bias, static_cast(0)); + d_bias_data = d_bias->data(); + } + + const T* scale_data = nullptr; + if (scale) scale_data = scale->data(); + + int imsize = x_dims[2] * x_dims[3]; + int block_size = std::min(512, imsize); + dim3 grid(group_size, groups, x_dims[0]); + dim3 threads(block_size, 1, 1); + GroupNormBackwardGetMeanAndVar<<>>( + x_data, mean_data, var_data, scale_data, y_data, x_dims[0], x_dims[1], + imsize, groups, group_size, epsilon, d_x_data, temp_mean_data, + temp_var_data, d_scale_data, d_bias_data); + GroupNormBackward<<>>( + x_data, mean_data, var_data, temp_mean_data, temp_var_data, x_dims[0], + x_dims[1], imsize, groups, group_size, epsilon, d_x_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + group_norm, + ops::GroupNormKernel, + ops::GroupNormKernel); +REGISTER_OP_CUDA_KERNEL( + group_norm_grad, + ops::GroupNormGradKernel, + ops::GroupNormGradKernel); diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h new file mode 100644 index 0000000000..3d6c6a46a9 --- /dev/null +++ b/paddle/fluid/operators/group_norm_op.h @@ -0,0 +1,197 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +class GroupNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* x = ctx.Input("X"); + + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); + const auto groups = ctx.Attr("groups"); + + const auto x_dims = x->dims(); + const int group_size = (x_dims[1] - 1) / groups + 1; + + y->mutable_data(ctx.GetPlace()); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + + auto* x_data = x->data(); + auto* y_data = y->data(); + auto* mean_data = mean->data(); + auto* var_data = var->data(); + + const T* scale_data = nullptr; + if (scale) scale_data = scale->data(); + const T* bias_data = nullptr; + if (bias) bias_data = bias->data(); + + int imsize = x_dims[2] * x_dims[3]; + auto* iter_x_data = x_data; + auto* iter_y_data = y_data; + for (int bid = 0; bid < x_dims[0]; bid++) + for (int gid = 0; gid < groups; gid++) { + T x_mean = 0, x_var = 0; + int number = std::min(group_size, + static_cast(x_dims[1] - gid * group_size)); + auto* tmp = iter_x_data; + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; imid++, iter_x_data++) { + x_mean += iter_x_data[0]; + x_var += iter_x_data[0] * iter_x_data[0]; + } + } + x_mean /= number * imsize; + x_var /= number * imsize; + x_var = x_var - x_mean * x_mean; + T var_inv = 1.0 / sqrt(x_var + epsilon); + mean_data[bid * groups + gid] = x_mean; + var_data[bid * groups + gid] = x_var; + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; imid++, tmp++, iter_y_data++) { + T val = (tmp[0] - x_mean) * var_inv; + if (scale_data) val *= scale_data[gid * group_size + cid]; + if (bias_data) val += bias_data[gid * group_size + cid]; + iter_y_data[0] = val; + } + } + } + } +}; + +template +class GroupNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto* x = ctx.Input("X"); + auto* mean = ctx.Input("Mean"); + auto* var = ctx.Input("Variance"); + auto* scale = ctx.Input("Scale"); + auto* d_y = ctx.Input(framework::GradVarName("Y")); + const auto groups = ctx.Attr("groups"); + + // init output + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_scale = ctx.Output(framework::GradVarName("Scale")); + auto* d_bias = ctx.Output(framework::GradVarName("Bias")); + + const auto& x_dims = x->dims(); + const int group_size = (x_dims[1] - 1) / groups + 1; + + // TODO(liangdun): need to check d_x is null + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + T* d_x_data = nullptr; + if (d_x) { + d_x->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_x, static_cast(0)); + d_x_data = d_x->data(); + } + + auto* x_data = x->data(); + auto* y_data = d_y->data(); + auto* mean_data = mean->data(); + auto* var_data = var->data(); + T* d_scale_data = 
nullptr; + if (d_scale) { + d_scale->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_scale, static_cast(0)); + d_scale_data = d_scale->data(); + } + T* d_bias_data = nullptr; + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_bias, static_cast(0)); + d_bias_data = d_bias->data(); + } + + const T* scale_data = nullptr; + if (scale) scale_data = scale->data(); + + int imsize = x_dims[2] * x_dims[3]; + auto* iter_x_data = x_data; + auto* iter_d_x_data = d_x_data; + auto* iter_y_data = y_data; + for (int bid = 0; bid < x_dims[0]; bid++) + for (int gid = 0; gid < groups; gid++) { + T x_mean = mean_data[bid * groups + gid]; + T x_var = var_data[bid * groups + gid]; + T var_inv = 1.0 / sqrt(x_var + epsilon); + int number = std::min(group_size, + static_cast(x_dims[1] - gid * group_size)); + auto* tmp = iter_x_data; + auto* tmp2 = iter_d_x_data; + T d_var_inv = 0, d_x_mean = 0; + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; + imid++, tmp++, iter_y_data++, iter_d_x_data++) { + T val = (tmp[0] - x_mean) * var_inv; + T dval = iter_y_data[0]; + if (d_bias_data) d_bias_data[gid * group_size + cid] += dval; + if (d_scale_data) + d_scale_data[gid * group_size + cid] += val * dval; + if (scale_data) dval = scale_data[gid * group_size + cid] * dval; + + d_var_inv += (tmp[0] - x_mean) * dval; + T d_tmp = dval * var_inv; + if (d_x_data) iter_d_x_data[0] += d_tmp; + d_x_mean -= d_tmp; + } + } + + T d_x_var = + -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv; + d_x_mean -= 2 * d_x_var * x_mean; + d_x_var /= number * imsize; + d_x_mean /= number * imsize; + + iter_d_x_data = tmp2; + + if (d_x_data) { + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; + imid++, iter_x_data++, iter_d_x_data++) { + iter_d_x_data[0] += d_x_mean; + iter_d_x_data[0] += iter_x_data[0] * 2 * d_x_var; + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e0cc09a4c7..ccd9175b64 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -85,6 +85,7 @@ __all__ = [ 'row_conv', 'multiplex', 'layer_norm', + 'group_norm', 'softmax_with_cross_entropy', 'smooth_l1', 'one_hot', @@ -2547,6 +2548,84 @@ def layer_norm(input, return helper.append_activation(layer_norm_out) +@templatedoc() +def group_norm(input, + groups, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None, + data_layout='NCHW', + name=None): + """ + **Group Normalization Layer** + + Refer to `Group Normalization ` + + Args: + input(Variable): The input tensor variable. + groups(int): The number of groups that divided from channels. + epsilon(float): The small value added to the variance to prevent + division by zero. + param_attr(ParamAttr|None): The parameter attribute for the learnable + scale :math:`g`. If it is set to False, no scale will be added to the output units. + If it is set to None, the bias is initialized one. Default: None. + bias_attr(ParamAttr|None): The parameter attribute for the learnable + bias :math:`b`. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + act(str): Activation to be applied to the output of group normalizaiton. + data_layout(string|NCHW): Only NCHW is supported. + name (str): The name of this layer. It is optional. 
+ + Returns: + Variable: A tensor variable which is the result after applying group normalization on the input. + + Examples: + + >>> data = fluid.layers.data(name='data', shape=[8, 32, 32], + >>> dtype='float32') + >>> x = fluid.layers.group_norm(input=data, groups=4) + """ + helper = LayerHelper('group_norm', **locals()) + dtype = helper.input_dtype() + + # create intput and parameters + inputs = {'X': input} + input_shape = input.shape + if data_layout != 'NCHW': + raise ValueError("unsupported data layout:" + data_layout) + param_shape = [input_shape[1]] + if param_attr: + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0)) + inputs['Scale'] = scale + if bias_attr: + bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) + inputs['Bias'] = bias + + # create output + mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) + variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) + group_norm_out = helper.create_tmp_variable(dtype) + + helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": epsilon, + "groups": groups}) + + return helper.append_activation(group_norm_out) + + def conv2d_transpose(input, num_filters, output_size=None, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index c195a28e45..271b9c740f 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -381,8 +381,8 @@ class OpTest(unittest.TestCase): outs.sort(key=len) checker(outs) - def __assert_is_close(self, numeric_grads, analytic_grads, names, - max_relative_error, msg_prefix): + def _assert_is_close(self, numeric_grads, analytic_grads, names, + max_relative_error, msg_prefix): for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names): abs_a = np.abs(a) @@ -451,9 +451,9 @@ class OpTest(unittest.TestCase): analytic_grads = self._get_gradient(inputs_to_check, place, output_names, no_grad_set) - self.__assert_is_close(numeric_grads, analytic_grads, inputs_to_check, - max_relative_error, - "Gradient Check On %s" % str(place)) + self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check, + max_relative_error, + "Gradient Check On %s" % str(place)) @staticmethod def _numpy_to_lod_tensor(np_value, lod, place): diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op.py b/python/paddle/fluid/tests/unittests/test_group_norm_op.py new file mode 100644 index 0000000000..0b6d039f05 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py @@ -0,0 +1,143 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
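+
+# The tests below compare the operator's forward output against the NumPy
+# reference group_norm_naive and check gradients w.r.t. X, Scale and Bias on
+# CPUPlace and, when available, CUDAPlace; the large-data case instead
+# cross-checks CPU gradients against GPU gradients via _assert_is_close.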
+ +from __future__ import print_function +import unittest +import numpy as np + +from operator import mul +import paddle.fluid.core as core +import paddle.fluid as fluid +from op_test import OpTest + +from testsuite import create_op + + +def group_norm_naive(x, scale, bias, epsilon, groups): + N, C, H, W = x.shape + G = groups + x = x.reshape((N * G, -1)) + mean = np.mean(x, axis=1, keepdims=True) + var = np.var(x, axis=1, keepdims=True) + output = (x - mean) / np.sqrt(var + epsilon) + output = output.reshape((N, C, H, W)) * scale.reshape( + (-1, 1, 1)) + bias.reshape((-1, 1, 1)) + return output, mean.reshape((N, G)), var.reshape((N, G)) + + +class TestGroupNormOp(OpTest): + def setUp(self): + self.op_type = "group_norm" + self.data_format = "NCHW" + self.dtype = np.float32 + self.shape = (2, 4, 3, 3) + self.attrs = {'epsilon': 1e-5, 'groups': 2} + self.compare_between_place = False + self.init_test_case() + + input = np.random.random(self.shape).astype(self.dtype) + scale = np.random.random([self.shape[1]]).astype(self.dtype) + bias = np.random.random([self.shape[1]]).astype(self.dtype) + output, mean, var = group_norm_naive( + input, scale, bias, self.attrs['epsilon'], self.attrs['groups']) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(input), + 'Scale': OpTest.np_dtype_to_fluid_dtype(scale), + 'Bias': OpTest.np_dtype_to_fluid_dtype(bias) + } + self.outputs = {'Y': output, 'Mean': mean, 'Variance': var} + + def test_check_output(self): + atol = 1e-4 + place = core.CPUPlace() + self.check_output_with_place(place, atol=atol) + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=atol) + + def do_compare_between_place(self): + if not core.is_compiled_with_cuda(): return + place = core.CPUPlace() + place2 = core.CUDAPlace(0) + self.scope = core.Scope() + op_inputs = self.inputs if hasattr(self, "inputs") else dict() + op_outputs = self.outputs if hasattr(self, "outputs") else dict() + op_attrs = self.attrs if hasattr(self, "attrs") else dict() + self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs, + op_attrs) + inputs_to_check = set(['X', 'Scale', 'Bias']) + output_names = 'Y' + cpu_grads = self._get_gradient(inputs_to_check, place, output_names, + None) + gpu_grads = self._get_gradient(inputs_to_check, place2, output_names, + None) + self._assert_is_close(cpu_grads, gpu_grads, inputs_to_check, 0.005, + "Gradient Check On %s" % str(place)) + + def test_check_grad(self): + if self.compare_between_place: + self.do_compare_between_place() + return + place = core.CPUPlace() + self.check_grad_with_place( + place, set(['X', 'Scale', 'Bias']), 'Y', max_relative_error=0.01) + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['X', 'Scale', 'Bias']), + 'Y', + max_relative_error=0.01) + + def init_test_case(self): + pass + + +class TestGroupNormOp1(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + + +class TestGroupNormOp2(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + + +class TestGroupNormOpBigEps1(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOpBigEps2(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOpBigEps3(TestGroupNormOp): + def init_test_case(self): + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOpLargeData(TestGroupNormOp): + def 
init_test_case(self): + self.shape = (2, 32, 64, 64) + self.attrs['groups'] = 8 + self.compare_between_place = True + + +if __name__ == '__main__': + unittest.main() From dd6fd4c747df9ad5ffdf0f6eef8ef3683df871cb Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 22 Nov 2018 16:49:45 +0800 Subject: [PATCH 72/80] Utils for download and upload files with HDFS (#14473) * add hdfs utils * add hdfs utils * test=develop * update hdfs utils and add demo * fix multi_download return local files * test=develop * add sync multi upload, test=develop --- python/paddle/fluid/contrib/utils/__init__.py | 20 + .../paddle/fluid/contrib/utils/hdfs_utils.py | 505 ++++++++++++++++++ 2 files changed, 525 insertions(+) create mode 100644 python/paddle/fluid/contrib/utils/__init__.py create mode 100644 python/paddle/fluid/contrib/utils/hdfs_utils.py diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py new file mode 100644 index 0000000000..df6d367782 --- /dev/null +++ b/python/paddle/fluid/contrib/utils/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import hdfs_utils +from .hdfs_utils import * + +__all__ = hdfs_utils.__all__ diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py new file mode 100644 index 0000000000..251665d85e --- /dev/null +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -0,0 +1,505 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
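+# A minimal usage sketch (illustrative only; the hadoop_home path and the
+# config keys/values below are placeholders, not something this module
+# prescribes):
+#
+#   from paddle.fluid.contrib.utils import HDFSClient
+#
+#   configs = {
+#       "fs.default.name": "hdfs://namenode:8020",  # assumed NameNode address
+#       "hadoop.job.ugi": "username,password",      # assumed credentials
+#   }
+#   client = HDFSClient(hadoop_home="/usr/local/hadoop", configs=configs)
+#   if not client.is_exist("/user/demo"):
+#       client.makedirs("/user/demo")
+#   client.upload("/user/demo", "./local_file.txt", overwrite=True)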
+"""HDFS Utils""" + +import os +import subprocess +import multiprocessing +from datetime import datetime + +import re +import copy +import errno + +import logging + +__all__ = ["HDFSClient", "multi_download"] + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') +_logger = logging.getLogger("hdfs_utils") +_logger.setLevel(logging.INFO) + + +class HDFSClient(object): + def __init__(self, hadoop_home, configs): + self.pre_commands = [] + hadoop_bin = '%s/bin/hadoop' % hadoop_home + self.pre_commands.append(hadoop_bin) + dfs = 'fs' + self.pre_commands.append(dfs) + + for k, v in configs.iteritems(): + config_command = '-D%s=%s' % (k, v) + self.pre_commands.append(config_command) + + def __run_hdfs_cmd(self, commands, retry_times=5): + whole_commands = copy.deepcopy(self.pre_commands) + whole_commands.extend(commands) + + print('Running system command: {0}'.format(' '.join(whole_commands))) + + ret_code = 0 + ret_out = None + ret_err = None + for x in range(retry_times + 1): + proc = subprocess.Popen( + whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (output, errors) = proc.communicate() + ret_code, ret_out, ret_err = proc.returncode, output, errors + if ret_code: + _logger.warn( + 'Times: %d, Error running command: %s. Return code: %d, Error: %s' + % (x, ' '.join(whole_commands), proc.returncode, errors)) + else: + break + return ret_code, ret_out, ret_err + + def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5): + """ + upload the local file to hdfs + args: + local_file_path: the local file path + remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) + return: + True or False + """ + assert hdfs_path is not None + assert local_path is not None and os.path.exists(local_path) + + if os.path.isdir(local_path): + _logger.warn( + "The Local path: {} is dir and I will support it later, return". + format(local_path)) + return + + base = os.path.basename(local_path) + if not self.is_exist(hdfs_path): + self.makedirs(hdfs_path) + else: + if self.is_exist(os.path.join(hdfs_path, base)): + if overwrite: + _logger.error( + "The HDFS path: {} is exist and overwrite is True, delete it". + format(hdfs_path)) + self.delete(hdfs_path) + else: + _logger.error( + "The HDFS path: {} is exist and overwrite is False, return". + format(hdfs_path)) + return False + + put_commands = ["-put", local_path, hdfs_path] + returncode, output, errors = self.__run_hdfs_cmd(put_commands, + retry_times) + if returncode: + _logger.error("Put local path: {} to HDFS path: {} failed".format( + local_path, hdfs_path)) + return False + else: + _logger.info("Put local path: {} to HDFS path: {} successfully". + format(local_path, hdfs_path)) + return True + + def download(self, hdfs_path, local_path, overwrite=False, unzip=False): + """ + download from hdfs + args: + local_file_path: the local file path + remote_file_path: remote dir on hdfs + return: + True or False + """ + _logger.info('Downloading %r to %r.', hdfs_path, local_path) + _logger.info('Download of %s to %r complete.', hdfs_path, local_path) + + if not self.is_exist(hdfs_path): + print("HDFS path: {} do not exist".format(hdfs_path)) + return False + if self.is_dir(hdfs_path): + _logger.error( + "The HDFS path: {} is dir and I will support it later, return". 
+                format(hdfs_path))
+            return False
+
+        if os.path.exists(local_path):
+            base = os.path.basename(hdfs_path)
+            local_file = os.path.join(local_path, base)
+            if os.path.exists(local_file):
+                if overwrite:
+                    os.remove(local_file)
+                else:
+                    _logger.error(
+                        "The local path: {} already exists and overwrite is False; returning".
+                        format(local_file))
+                    return False
+
+        self.make_local_dirs(local_path)
+
+        download_commands = ["-get", hdfs_path, local_path]
+        returncode, output, errors = self.__run_hdfs_cmd(download_commands)
+        if returncode:
+            _logger.error("Get HDFS path: {} to local path: {} failed".format(
+                hdfs_path, local_path))
+            return False
+        else:
+            _logger.info("Get HDFS path: {} to local path: {} successfully".
+                         format(hdfs_path, local_path))
+            return True
+
+    def is_exist(self, hdfs_path=None):
+        """
+        whether the remote hdfs path exists
+        args:
+            hdfs_path: the path to test on hdfs
+        return:
+            True or False
+        """
+        exist_cmd = ['-test', '-e', hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(
+            exist_cmd, retry_times=1)
+
+        if returncode:
+            _logger.info("HDFS path: {} does not exist".format(
+                hdfs_path))
+            return False
+        else:
+            _logger.info("HDFS path: {} exists".format(
+                hdfs_path))
+            return True
+
+    def is_dir(self, hdfs_path=None):
+        """
+        whether the remote hdfs path is a directory
+        args:
+            hdfs_path: the path to test on hdfs
+        return:
+            True or False
+        """
+
+        if not self.is_exist(hdfs_path):
+            return False
+
+        dir_cmd = ['-test', '-d', hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1)
+
+        if returncode:
+            _logger.info("HDFS path: {} is not a directory".format(
+                hdfs_path))
+            return False
+        else:
+            _logger.info("HDFS path: {} is a directory".format(
+                hdfs_path))
+            return True
+
+    def delete(self, hdfs_path):
+        """Remove a file or directory from HDFS.
+
+        :param hdfs_path: HDFS path. Directories are removed recursively
+          (via ``-rmr``), plain files via ``-rm``.
+
+        This function returns `True` if the deletion was successful or if
+        no file or directory previously existed at `hdfs_path`.
+
+        """
+        _logger.info('Deleting %r.', hdfs_path)
+
+        if not self.is_exist(hdfs_path):
+            _logger.warn("HDFS path: {} does not exist".format(hdfs_path))
+            return True
+
+        if self.is_dir(hdfs_path):
+            del_cmd = ['-rmr', hdfs_path]
+        else:
+            del_cmd = ['-rm', hdfs_path]
+
+        returncode, output, errors = self.__run_hdfs_cmd(del_cmd, retry_times=0)
+
+        if returncode:
+            _logger.error("Deleting HDFS path: {} failed".format(
+                hdfs_path))
+            return False
+        else:
+            _logger.info("Deleted HDFS path: {} successfully".format(
+                hdfs_path))
+            return True
+
+    def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
+        """Move a file or folder.
+
+        :param hdfs_src_path: Source path.
+        :param hdfs_dst_path: Destination path. If the path already exists and is
+          a directory, the source will be moved into it. If the path exists and is
+          a file, or if a parent destination directory is missing, this method will
+          raise an :class:`HdfsError`.
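+
+        A minimal usage sketch (hypothetical paths, assuming the source
+        exists on the cluster)::
+
+            client.rename("/user/tmp/ckpt_old", "/user/tmp/ckpt_new",
+                          overwrite=True)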
+ + """ + assert hdfs_src_path is not None + assert hdfs_dst_path is not None + + if not self.is_exist(hdfs_src_path): + _logger.info("HDFS path do not exist: {}".format(hdfs_src_path)) + if self.is_exist(hdfs_dst_path) and not overwrite: + _logger.error("HDFS path is exist: {} and overwrite=False".format( + hdfs_dst_path)) + + rename_command = ['-mv', hdfs_src_path, hdfs_dst_path] + returncode, output, errors = self.__run_hdfs_cmd( + rename_command, retry_times=1) + + if returncode: + _logger.error("HDFS rename path: {} to {} failed".format( + hdfs_src_path, hdfs_dst_path)) + return False + else: + _logger.info("HDFS rename path: {} to {} successfully".format( + hdfs_src_path, hdfs_dst_path)) + return True + + @staticmethod + def make_local_dirs(local_path): + try: + os.makedirs(local_path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + def makedirs(self, hdfs_path): + """Create a remote directory, recursively if necessary. + + :param hdfs_path: Remote path. Intermediate directories will be created + appropriately. + """ + _logger.info('Creating directories to %r.', hdfs_path) + assert hdfs_path is not None + + if self.is_exist(hdfs_path): + return + + mkdirs_commands = ['-mkdir', hdfs_path] + returncode, output, errors = self.__run_hdfs_cmd( + mkdirs_commands, retry_times=1) + + if returncode: + _logger.error("HDFS mkdir path: {} failed".format(hdfs_path)) + return False + else: + _logger.error("HDFS mkdir path: {} successfully".format(hdfs_path)) + return True + + def ls(self, hdfs_path): + assert hdfs_path is not None + + if not self.is_exist(hdfs_path): + return [] + + ls_commands = ['-ls', hdfs_path] + returncode, output, errors = self.__run_hdfs_cmd( + ls_commands, retry_times=1) + + if returncode: + _logger.error("HDFS list path: {} failed".format(hdfs_path)) + return [] + else: + _logger.info("HDFS list path: {} successfully".format(hdfs_path)) + + ret_lines = [] + regex = re.compile('\s+') + out_lines = output.strip().split("\n") + for line in out_lines: + re_line = regex.split(line) + if len(re_line) == 8: + ret_lines.append(re_line[7]) + return ret_lines + + def lsr(self, hdfs_path, only_file=True, sort=True): + def sort_by_time(v1, v2): + v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M') + v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M') + return v1_time > v2_time + + assert hdfs_path is not None + + if not self.is_exist(hdfs_path): + return [] + + ls_commands = ['-lsr', hdfs_path] + returncode, output, errors = self.__run_hdfs_cmd( + ls_commands, retry_times=1) + + if returncode: + _logger.error("HDFS list all files: {} failed".format(hdfs_path)) + return [] + else: + _logger.info("HDFS list all files: {} successfully".format( + hdfs_path)) + lines = [] + regex = re.compile('\s+') + out_lines = output.strip().split("\n") + for line in out_lines: + re_line = regex.split(line) + if len(re_line) == 8: + if only_file and re_line[0][0] == "d": + continue + else: + lines.append( + (re_line[7], re_line[5] + " " + re_line[6])) + if sort: + sorted(lines, cmp=sort_by_time) + ret_lines = [ret[0] for ret in lines] + return ret_lines + + +def multi_upload(client, + hdfs_path, + local_path, + multi_processes=5, + overwrite=False): + """ + :param overwrite: will overwrite hdfs file or not + :param multi_processes: the upload data process at the same time, default=5 + :param client: instance of HDFSClient + :param hdfs_path: path on hdfs + :param local_path: path on local + :return: + """ + + def __subprocess_upload(datas): + for data in datas: + re_path = 
+            hdfs_re_path = os.path.join(hdfs_path, re_path)
+            client.upload(hdfs_re_path, data, overwrite, retry_times=5)
+
+    def get_local_files(path):
+        rlist = []
+
+        if not os.path.isdir(path):
+            return rlist
+
+        for dirname, folder, files in os.walk(path):
+            for i in files:
+                t = os.path.join(dirname, i)
+                rlist.append(t)
+        return rlist
+
+    assert isinstance(client, HDFSClient)
+
+    all_files = get_local_files(local_path)
+    if not all_files:
+        _logger.info("there is nothing to upload, exit")
+        return
+    _logger.info("Start {} processes to upload files".format(
+        multi_processes))
+    procs = []
+    for i in range(multi_processes):
+        process_datas = all_files[i::multi_processes]
+        p = multiprocessing.Process(
+            target=__subprocess_upload, args=(process_datas, ))
+        procs.append(p)
+        p.start()
+
+    # wait for all upload processes to finish
+    for proc in procs:
+        proc.join()
+
+    _logger.info("Finished uploading files with {} processes".format(
+        multi_processes))
+
+
+def multi_download(client,
+                   hdfs_path,
+                   local_path,
+                   trainer_id,
+                   trainers,
+                   multi_processes=5):
+    """
+    multi_download
+    :param client: instance of HDFSClient
+    :param hdfs_path: source path on hdfs
+    :param local_path: destination path on the local filesystem
+    :param trainer_id: current trainer id
+    :param trainers: total number of trainers
+    :param multi_processes: number of download processes to run at the same time, default=5
+    :return: a list of the local paths of the downloaded files
+    """
+
+    def __subprocess_download(datas):
+        for data in datas:
+            re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
+            local_re_path = os.path.join(local_path, re_path)
+            client.download(data, local_re_path)
+
+    assert isinstance(client, HDFSClient)
+
+    client.make_local_dirs(local_path)
+    _logger.info("Make local dir {} successfully".format(local_path))
+
+    all_need_download = client.lsr(hdfs_path, sort=True)
+    need_download = all_need_download[trainer_id::trainers]
+    _logger.info("Got {} files out of all {} files to download from {}".
+                 format(len(need_download), len(all_need_download), hdfs_path))
+
+    _logger.info("Start {} processes to download files".format(
+        multi_processes))
+    procs = []
+    for i in range(multi_processes):
+        process_datas = need_download[i::multi_processes]
+        p = multiprocessing.Process(
+            target=__subprocess_download, args=(process_datas, ))
+        procs.append(p)
+        p.start()
+
+    # wait for all download processes to finish
+    for proc in procs:
+        proc.join()
+
+    _logger.info("Finished downloading files with {} processes".format(
+        multi_processes))
+
+    local_downloads = []
+    for data in need_download:
+        data_name = os.path.basename(data)
+        re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
+        local_re_path = os.path.join(local_path, re_path, data_name)
+        local_downloads.append(local_re_path)
+
+    return local_downloads
+
+
+if __name__ == "__main__":
+    hadoop_home = "/home/client/hadoop-client/hadoop/"
+
+    configs = {
+        "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+        "hadoop.job.ugi": "hello,hello123"
+    }
+
+    client = HDFSClient(hadoop_home, configs)
+
+    client.ls("/user/com/train-25")
+    files = client.lsr("/user/com/train-25/models")
+
+    downloads = multi_download(
+        client,
+        "/user/com/train-25/model",
+        "/home/xx/data1",
+        1,
+        5,
+        multi_processes=5)
+
+    multi_upload(client, "/user/com/train-25/model", "/home/xx/data1")

From 510601b2793047858763032b7816af07ab2b2bc7 Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Thu, 22 Nov 2018 09:01:08 +0000
Subject: [PATCH 73/80] test=develop

---
 python/paddle/fluid/layers/nn.py                   | 10 +++++++---
 python/paddle/fluid/tests/unittests/test_layers.py |  7 ++++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 32d411b830..27f83a60bd 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2139,12 +2139,16 @@ def pool2d(input,
                           input tensor is NCHW, where N is batch size, C is
                           the number of channels, H is the height of the
                           feature, and W is the width of the feature.
-        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple,
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
            it must contain two integers, (pool_size_Height, pool_size_Width).
            Otherwise, the pool kernel size will be a square of an int.
        pool_type: ${pooling_type_comment}
-        pool_stride (int): stride of the pooling layer.
-        pool_padding (int): padding size.
+        pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int.
+        pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
+            it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
+            Otherwise, the pool padding size will be a square of an int.
global_pooling (bool): ${global_pooling_comment} use_cudnn (bool): ${use_cudnn_comment} ceil_mode (bool): ${ceil_mode_comment} diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index c4310fe006..559c9cda48 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -206,7 +206,12 @@ class TestBook(unittest.TestCase): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') - self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3])) + self.assertIsNotNone( + layers.pool2d( + x, + pool_size=[5, 3], + pool_stride=[1, 2], + pool_padding=(2, 1))) def test_lstm_unit(self): program = Program() From 83370576cd8f35e4155d94a789c886c8c264056d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 22 Nov 2018 18:52:54 +0800 Subject: [PATCH 74/80] Add sqlite3 support in Python3.6 test=develop --- tools/manylinux1/build_scripts/build_utils.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh index d97745ad2d..48cce15a14 100755 --- a/tools/manylinux1/build_scripts/build_utils.sh +++ b/tools/manylinux1/build_scripts/build_utils.sh @@ -50,6 +50,15 @@ function do_cpython_build { mkdir -p ${prefix}/lib # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6 + if [ $(lex_pyver $py_ver) -eq $(lex_pyver 3.6) ]; then + wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz + tar -zxf sqlite-autoconf-3250300.tar.gz + cd sqlite-autoconf-3250300 + ./configure --prefix=/usr/local + make -j8 && make install + cd ../ && rm sqlite-autoconf-3250300.tar.gz + fi + # NOTE --enable-shared for generating libpython shared library needed for # linking of some of the nupic.core test executables. 
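 # A quick sanity check (illustrative only, not part of this script): ask the
 # rebuilt interpreter which SQLite it linked against, e.g.
 #   LD_LIBRARY_PATH=/usr/local/lib ${prefix}/bin/python3.6 -c \
 #     "import sqlite3; print(sqlite3.sqlite_version)"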
if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then @@ -59,9 +68,9 @@ function do_cpython_build { make -j8 > /dev/null make altinstall > /dev/null else - CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null - make -j8 > /dev/null - make install > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make -j8 > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make install > /dev/null fi popd echo "ZZZ looking for libpython" From 00b9e9a1357bb3fa6e6adceb4e650d9f6424aa2a Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 22 Nov 2018 20:40:56 +0800 Subject: [PATCH 75/80] Refine cublas to support CUBLAS_TENSOR_OP_MATH (#13929) * refine cublase test=develop * code refine * refine cublas * add GEMME_EX * add enable_cublas_tensor_op_math doc and add cublasCall test=develop * fix CublasCall for cuda version test=develop * fix error test=develop * fix GEMM_EX to be compatible with gcc 4.8 test=develop * add GEMM_EX test=develop * to compatiable with gcc4.8 test=develop --- paddle/fluid/operators/math/blas_impl.cu.h | 206 +++++++++++++++++---- paddle/fluid/platform/device_context.h | 47 +++++ paddle/fluid/platform/dynload/cublas.h | 26 ++- paddle/fluid/platform/gpu_info.cc | 20 ++ paddle/fluid/platform/gpu_info.h | 3 + python/paddle/fluid/__init__.py | 3 +- 6 files changed, 256 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d84c88cb3b..d35073029a 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -16,6 +16,9 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/gpu_info.h" + +DECLARE_bool(enable_cublas_tensor_op_math); namespace paddle { namespace operators { @@ -42,11 +45,44 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...)); #else PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const float *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const float *beta, void *C, + cudaDataType_t Ctype, int ldc) { + // Because the gcc 4.8 doesn't expand template parameter pack that + // appears in a lambda-expression, I can not use template parameter pack + // here. + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + VLOG(5) << "use_tensor_op_math: " + << (platform::TensorCoreAvailable() ? "True" : "False"); + PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc)); +#else + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. 
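+      // CublasCall (added to CUDADeviceContext in this same series) takes the
+      // handle's lock, switches the math mode with ScopedCublasMathMode, runs
+      // the callback, and restores the previous mode on scope exit.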
+ dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -69,13 +105,18 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...)); #else PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5"); #endif } + + template + static void GEMM_EX(ARGS... args) { + PADDLE_THROW("Currently there are not cublasDgemmEx."); + } }; template <> @@ -96,14 +137,16 @@ struct CUBlas { reinterpret_cast<__half *>(C), ldc)); } - static void GEMM_BATCH(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const float16 *alpha, const float16 *A, int lda, - long long int strideA, const float16 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const float16 *beta, float16 *C, int ldc, - long long int strideC, // NOLINT - int batchCount) { + static void GEMM_STRIDED_BATCH(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float16 *alpha, const float16 *A, + int lda, long long int strideA, // NOLINT + const float16 *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const float16 *beta, float16 *C, int ldc, + long long int strideC, // NOLINT + int batchCount) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( handle, transa, transb, m, n, k, @@ -114,6 +157,45 @@ struct CUBlas { ldc, strideC, batchCount)); #else PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const void *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const void *beta, void *C, + cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + PADDLE_ENFORCE(platform::dynload::cublasGemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); +#else + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -133,8 +215,21 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, N); +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, N); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, N); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 } template <> @@ -157,30 +252,18 @@ inline void Blas::GEMM( PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53, "cublas fp16 gemm requires GPU compute capability >= 53"); -#if CUDA_VERSION >= 8000 float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - if (context_.GetComputeCapability() >= 70) { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH)); - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } else { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_DEFAULT_MATH)); - } -#endif // CUDA_VERSION >= 9000 - +#if CUDA_VERSION >= 8000 // cublasHgemm does true FP16 computation which is slow for non-Volta // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: // input/output in fp16, computation in fp32, which can also be accelerated // using tensor cores in volta GPUs. - PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B, - CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, - CUDA_R_32F, algo)); + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A, + CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, @@ -199,8 +282,38 @@ void Blas::GEMM(bool transA, bool transB, int M, // the cblas convention. cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, ldc); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + +template <> +template <> +inline void Blas::GEMM( + bool transA, bool transB, int M, int N, int K, platform::float16 alpha, + const platform::float16 *A, int lda, const platform::float16 *B, int ldb, + platform::float16 beta, platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? 
CUBLAS_OP_T : CUBLAS_OP_N; + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, + ldc); } template <> @@ -238,9 +351,34 @@ void Blas::BatchedGEMM( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; - CUBlas::GEMM_BATCH(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, - strideC, batchCount); +#if CUDA_VERSION >= 9010 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto cublas_call = [&]() { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + + PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( + context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, + CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); + }; + auto &dev_ctx = const_cast(context_); + dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); + } else { +#endif // CUDA_VERSION >= 9010 + + CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, strideB, A, lda, + strideA, &beta, C, ldc, strideC, batchCount); + +#if CUDA_VERSION >= 9010 + } +#endif // CUDA_VERSION >= 9010 } } // namespace math diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 9a9018cdea..3edd727978 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -143,6 +143,39 @@ class CudnnWorkspaceHandle { std::unique_ptr> guard_; }; +#if CUDA_VERSION >= 9000 +class ScopedCublasMathMode { + public: + ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) + : handle_(handle) { + need_reset = false; + PADDLE_ENFORCE( + platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), + "Failed to get old cublas math mode"); + if (old_math_mode_ != new_math_mode) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, new_math_mode), + "Failed to set old cublas math mode"); + need_reset = true; + } + } + + ~ScopedCublasMathMode() { + if (need_reset) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, old_math_mode_), + "Failed to set old cublas math mode"); + } + } + + private: + cublasHandle_t handle_; + cublasMath_t old_math_mode_; + bool need_reset; +}; + +#endif + class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext { callback_manager_->Wait(); } +#if CUDA_VERSION >= 9000 + /*! \brief CublasCall may need to change cublas's config, + * but the cublas may be hold by multi-thread, so we should + * add lock here. 
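+   *
+   * A typical call site wraps the cuBLAS invocation in a lambda (a sketch;
+   * gemm_call stands in for any cuBLAS routine):
+   *   dev_ctx.CublasCall([&]() { gemm_call(); }, CUBLAS_TENSOR_OP_MATH);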
*/ + template + void CublasCall(Callback callback, cublasMath_t new_math) { + std::lock_guard guard(cublas_mtx_); + ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); + callback(); + } +#endif + private: CUDAPlace place_; @@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext { // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes mutable std::mutex callback_mtx_; std::unique_ptr callback_manager_; + + mutable std::mutex cublas_mtx_; }; template <> diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 4ea0cd7283..ff80bd525c 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -61,9 +61,6 @@ extern void *cublas_dso_handle; extern DynLoad__##__name __name #endif -#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ - DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) - #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasSaxpy_v2); \ __macro(cublasDaxpy_v2); \ @@ -93,22 +90,23 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // APIs available after CUDA 8.0 #if CUDA_VERSION >= 8000 -#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ - __macro(cublasGemmEx); \ - __macro(cublasSgemmStridedBatched); \ - __macro(cublasDgemmStridedBatched); \ - __macro(cublasCgemmStridedBatched); \ - __macro(cublasZgemmStridedBatched); \ - __macro(cublasHgemmStridedBatched); - -CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched); #endif // APIs available after CUDA 9.0 #if CUDA_VERSION >= 9000 -#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode); +#endif -CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +#if CUDA_VERSION >= 9010 +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx); #endif #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c78f159ad2..833d48347f 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -26,6 +26,16 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, "additional trunks of the same size will be requested from gpu " "until the gpu has no memory left for another trunk."); +DEFINE_bool( + enable_cublas_tensor_op_math, false, + "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " + "but it may loss precision. Currently, There are two CUDA libraries that" + " use Tensor Cores, cuBLAS and cuDNN. 
cuBLAS uses Tensor Cores to speed up"
    " GEMM computations (the matrices must be either half precision or single "
    "precision); cuDNN uses Tensor Cores to speed up both convolutions (the "
    "input and output must be half precision) and recurrent neural networks "
    "(RNNs).");
+
 namespace paddle {
 namespace platform {
@@ -64,6 +74,16 @@ int GetCUDADriverVersion(int id) {
   return driver_version;
 }
 
+bool TensorCoreAvailable() {
+#if CUDA_VERSION >= 9000
+  int device = GetCurrentDeviceId();
+  int compute_capability = GetCUDAComputeCapability(device);
+  return compute_capability >= 70;
+#else
+  return false;
+#endif
+}
+
 int GetCUDAMultiProcessors(int id) {
   PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must be less than GPU count");
   int count;
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index be44158431..6a0b3c8e02 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id);
 //! Get the driver version of the ith GPU
 int GetCUDADriverVersion(int id);
 
+//! Whether the current device supports TensorCore
+bool TensorCoreAvailable();
+
 //! Get the MultiProcessors of the ith GPU.
 int GetCUDAMultiProcessors(int i);
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 543acf2d34..3c092dee34 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -133,7 +133,8 @@ def __bootstrap__():
     if core.is_compiled_with_cuda():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
-            'conv_workspace_size_limit', 'cudnn_exhaustive_search'
+            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
+            'cudnn_exhaustive_search'
         ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])

From 6cc6bf4074d69c5c0b02af612b94e438d596803a Mon Sep 17 00:00:00 2001
From: Krzysztof Binias
Date: Thu, 22 Nov 2018 15:30:43 +0100
Subject: [PATCH 76/80] Bumped MKL-DNN version to 0.17

test=develop
---
 cmake/external/mkldnn.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 785148d4f9..b280db23b9 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -53,7 +53,7 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
     GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "21fb5f2af1dd14e132af4f1b79160977ee487818"
+    GIT_TAG             "830a10059a018cd2634d94195140cf2d8790a75a"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

From a902b8b0f811f6837330385b95fa2f552393197c Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Fri, 23 Nov 2018 01:07:58 +0800
Subject: [PATCH 77/80] Add sqlite3 support

test=develop
---
 tools/manylinux1/Dockerfile.x64 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index e91216a5b8..48fd145e5f 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh && \
-    bash build_scripts/install_nccl2.sh && rm -r build_scripts
+    bash build_scripts/install_nccl2.sh && rm -rf build_scripts
 ENV SSL_CERT_FILE=/opt/_internal/certs.pem

From 9ea1ce63192fee1a211aa5dcc6fecf4758434451 Mon Sep 17 00:00:00 2001
From: Xin Pan Date: Fri, 23 Nov 2018 11:51:07 +0800 Subject: [PATCH 78/80] Update issue templates --- .github/ISSUE_TEMPLATE/---feature-request-.md | 27 +++++++++++++ .github/ISSUE_TEMPLATE/---inference-issue-.md | 40 +++++++++++++++++++ .../ISSUE_TEMPLATE/---installation-issue-.md | 40 +++++++++++++++++++ .github/ISSUE_TEMPLATE/---model-issue-.md | 36 +++++++++++++++++ .github/ISSUE_TEMPLATE/---others-.md | 33 +++++++++++++++ .github/ISSUE_TEMPLATE/---training-issue-.md | 38 ++++++++++++++++++ 6 files changed, 214 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/---feature-request-.md create mode 100644 .github/ISSUE_TEMPLATE/---inference-issue-.md create mode 100644 .github/ISSUE_TEMPLATE/---installation-issue-.md create mode 100644 .github/ISSUE_TEMPLATE/---model-issue-.md create mode 100644 .github/ISSUE_TEMPLATE/---others-.md create mode 100644 .github/ISSUE_TEMPLATE/---training-issue-.md diff --git a/.github/ISSUE_TEMPLATE/---feature-request-.md b/.github/ISSUE_TEMPLATE/---feature-request-.md new file mode 100644 index 0000000000..57708855dc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---feature-request-.md @@ -0,0 +1,27 @@ +--- +name: 建议(Feature request) +about: 您可以提出您的建议。 You could use this template for reporting a suggestion  issue. + +--- + +欢迎您对PaddlePaddle提出建议,非常感谢您对PaddlePaddle的贡献! +在留下您的建议时,辛苦您同步提供如下信息: +- 版本、环境信息 +1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1 +2)CPU/GPU:您是否使用GPU进行训练,如是,请提供您的CUDA和cuDNN版本号 +3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 建议描述:请您详细描述,您认为需优化的功能 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. +Please make sure that this is a feature request. +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +**To Reproduce** +Steps to reproduce the behavior +**Describe the feature and the current behavior/state.** +**Any Other info.** diff --git a/.github/ISSUE_TEMPLATE/---inference-issue-.md b/.github/ISSUE_TEMPLATE/---inference-issue-.md new file mode 100644 index 0000000000..37bdc8889e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---inference-issue-.md @@ -0,0 +1,40 @@ +--- +name: 预测(Inference Issue) +about: 您可以提问预测中报错、应用等问题。 You could use this template for reporting an inference issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准描述您的问题,例如“最新预测库的API文档在哪儿 ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID +    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况 +    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号 +    4)系统环境:请您描述系统类型、版本(如Mac OS 10.14),Python版本 +-预测信息 +    1)C++预测:请您提供预测库安装包的版本信息,及其中的version.txt文件 +    2)CMake包含路径的完整命令 +    3)API信息(如调用请提供) +    4)预测库来源:官网下载/特殊环境(如BCLOUD编译) +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. 
+Before submitting the issue, you could search issue in the github in case that th +If there is no solution,please make sure that this is an inference issue including the following details : +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Python version +-Cmake orders +-C++version.txt +-API information +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---installation-issue-.md b/.github/ISSUE_TEMPLATE/---installation-issue-.md new file mode 100644 index 0000000000..ce4ba58932 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---installation-issue-.md @@ -0,0 +1,40 @@ +--- +name: 安装(Installation Issue) +about: 您可以提问安装、编译出现报错等问题。 You could use this template for reporting an installation +  issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +建立issue时,为快速解决问题,请您根据使用情况给出如下信息: +- 标题:请包含关键词“安装错误”/“编译错误”,例如“Mac编译错误” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID +    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况 +    3)GPU:请提供GPU型号,CUDA和CUDNN版本号 +    4)系统环境:请说明系统类型、版本(如Mac OS 10.14)、Python版本 +- 安装方式信息: +1)pip安装/docker安装 +2)本地编译:请提供cmake命令,编译命令 +3)docker编译:请提供docker镜像,编译命令            +  特殊环境请注明:如离线安装等 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in Github in case that there was a similar issue submitted or resolved before. +If there is no solution,please make sure that this is an installation issue including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg. Mac OS 10.14) +-Python version +- Install method: pip install/install with docker/build from source(without docker)/build within docker +- Other special cases that you think may be related to this problem, eg. offline install, special internet condition   +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---model-issue-.md b/.github/ISSUE_TEMPLATE/---model-issue-.md new file mode 100644 index 0000000000..7cb52f37b9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---model-issue-.md @@ -0,0 +1,36 @@ +--- +name: 模型(Model Issue) +about: 您可以提问模型、算法、数据集方向的使用报错等问题。You could use this template for reporting a model/ + algorithm/dataset  issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +建立issue时,为快速解决问题,请您根据使用情况给出如下信息: +- 标题:简洁、精准描述您的问题,例如“ssd 模型前置lstm报错  ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供PaddlePaddle版本号,例如1.1或CommitID +    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况 +    3)GPU:请提供GPU型号,CUDA和CUDNN版本号 +    4)系统环境:请说明系统类型、版本(例如Mac OS 10.14),Python版本 +- 模型信息 +    1)模型名称 2)使用数据集名称 3)使用算法名称 4)模型链接 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github.Probably there was a similar issue submitted or resolved before. 
+If there is no solution,please make sure that this is a issue of models including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Python version +-Name of Models&Dataset/details of operator +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---others-.md b/.github/ISSUE_TEMPLATE/---others-.md new file mode 100644 index 0000000000..6a291153e4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---others-.md @@ -0,0 +1,33 @@ +--- +name: 其他(Others) +about: 如上述分类未包含您的问题,可在此提出。 You could use this template for reporting other issues + +--- + +为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准概括您的问题 +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID +    2)CPU/GPU:如果您使用GPU训练,请提供GPU驱动版本、CUDA和cuDNN版本号 +    3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14 +    4)Python版本号 +    5)显存信息 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. +If there is no solution,please provide us with the following details : +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/cuDNN version +-OS Platform and Distribution(eg.Mac OS 10.14) +-Python version +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---training-issue-.md b/.github/ISSUE_TEMPLATE/---training-issue-.md new file mode 100644 index 0000000000..29e8383d97 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---training-issue-.md @@ -0,0 +1,38 @@ +--- +name: 训练(Training issue) +about: 您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting an training +  issue. + +--- + +为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准概括您的问题,例如“Insufficient Memory xxx" ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID +    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况 +    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号 +    4)系统环境:请您描述系统类型、版本,例如Mac OS 10.14,Python版本 +- 训练信息 +    1)单机/多机,单卡/多卡 +    2)显存信息 +    3)Operator信息 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. 
+If there is no solution,please make sure that this is a training issue including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Other imformation: Distriuted training/informantion of operator/ +Graphics card storage +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** From 36f08eef3b466001f339e2c33f47dac60bbc6821 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Fri, 23 Nov 2018 13:04:41 +0800 Subject: [PATCH 79/80] CUDA kernel for density_prior_box_op. (#14513) * CUDA kernel for density_prior_box_op. * Support flatten to 2D. --- paddle/fluid/API.spec | 2 +- paddle/fluid/framework/op_desc.cc | 6 + .../fluid/operators/detection/CMakeLists.txt | 2 +- .../detection/density_prior_box_op.cc | 36 ++-- .../detection/density_prior_box_op.cu | 170 ++++++++++++++++++ .../detection/density_prior_box_op.h | 73 ++++---- python/paddle/fluid/layers/detection.py | 43 +++-- python/paddle/fluid/tests/test_detection.py | 60 ++++--- .../unittests/test_density_prior_box_op.py | 30 ++-- 9 files changed, 305 insertions(+), 117 deletions(-) create mode 100644 paddle/fluid/operators/detection/density_prior_box_op.cu diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 541c4db1fa..50114bf3df 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)) -paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None)) +paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)) paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) diff --git 
a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index fbaa169df6..362cda3f23 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { this->attrs_[name] = std::vector(); break; } + case proto::AttrType::LONGS: { + VLOG(110) << "SetAttr: " << Type() << ", " << name + << " from LONGS to LONGS"; + this->attrs_[name] = std::vector(); + break; + } case proto::AttrType::FLOATS: { VLOG(110) << "SetAttr: " << Type() << ", " << name << " from INTS to FLOATS"; diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 58f6f48467..6c85f1577e 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -22,7 +22,7 @@ iou_similarity_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) -detection_library(density_prior_box_op SRCS density_prior_box_op.cc) +detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(target_assign_op SRCS target_assign_op.cc diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc index 99df15c322..1012ba3652 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -39,24 +39,27 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { auto fixed_sizes = ctx->Attrs().Get>("fixed_sizes"); auto fixed_ratios = ctx->Attrs().Get>("fixed_ratios"); auto densities = ctx->Attrs().Get>("densities"); + bool flatten = ctx->Attrs().Get("flatten_to_2d"); PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(), "The number of fixed_sizes and densities must be equal."); size_t num_priors = 0; - if ((fixed_sizes.size() > 0) && (densities.size() > 0)) { - for (size_t i = 0; i < densities.size(); ++i) { - if (fixed_ratios.size() > 0) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - } + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + if (!flatten) { + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } else { + int64_t dim0 = input_dims[2] * input_dims[3] * num_priors; + ctx->SetOutputDim("Boxes", {dim0, 4}); + ctx->SetOutputDim("Variances", {dim0, 4}); } - std::vector dim_vec(4); - dim_vec[0] = input_dims[2]; - dim_vec[1] = input_dims[3]; - dim_vec[2] = num_priors; - dim_vec[3] = 4; - ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); - ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); } protected: @@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - platform::CPUPlace()); + ctx.GetPlace()); } }; @@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public 
framework::OpProtoAndCheckerMaker { }); AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") .SetDefault(true); - + AddAttr("flatten_to_2d", + "(bool) Whether to flatten to 2D and " + "the second dim is 4.") + .SetDefault(false); AddAttr( "step_w", "Density prior boxes step across width, 0.0 for auto calculation.") diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu new file mode 100644 index 0000000000..3b7c781795 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -0,0 +1,170 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" + +namespace paddle { +namespace operators { + +template +static __device__ inline T Clip(T in) { + return min(max(in, 0.), 1.); +} + +template +static __global__ void GenDensityPriorBox( + const int height, const int width, const int im_height, const int im_width, + const T offset, const T step_width, const T step_height, + const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin, + const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) { + int gidx = blockIdx.x * blockDim.x + threadIdx.x; + int gidy = blockIdx.y * blockDim.y + threadIdx.y; + int step_x = blockDim.x * gridDim.x; + int step_y = blockDim.y * gridDim.y; + + const T* width_ratio = ratios_shift; + const T* height_ratio = ratios_shift + num_priors; + const T* width_shift = ratios_shift + 2 * num_priors; + const T* height_shift = ratios_shift + 3 * num_priors; + + for (int j = gidy; j < height; j += step_y) { + for (int i = gidx; i < width * num_priors; i += step_x) { + int h = j; + int w = i / num_priors; + int k = i % num_priors; + + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + + T center_x_temp = center_x + width_shift[k]; + T center_y_temp = center_y + height_shift[k]; + + T box_width_ratio = width_ratio[k] / 2.; + T box_height_ratio = height_ratio[k] / 2.; + + T xmin = max((center_x_temp - box_width_ratio) / im_width, 0.); + T ymin = max((center_y_temp - box_height_ratio) / im_height, 0.); + T xmax = min((center_x_temp + box_width_ratio) / im_width, 1.); + T ymax = min((center_y_temp + box_height_ratio) / im_height, 1.); + + int out_offset = (j * width * num_priors + i) * 4; + out[out_offset] = is_clip ? Clip(xmin) : xmin; + out[out_offset + 1] = is_clip ? Clip(ymin) : ymin; + out[out_offset + 2] = is_clip ? Clip(xmax) : xmax; + out[out_offset + 3] = is_clip ? 
Clip(ymax) : ymax; + + var[out_offset] = var_xmin; + var[out_offset + 1] = var_ymin; + var[out_offset + 2] = var_xmax; + var[out_offset + 3] = var_ymax; + } + } +} + +template +class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto variances = ctx.Attr>("variances"); + auto is_clip = ctx.Attr("clip"); + + auto fixed_sizes = ctx.Attr>("fixed_sizes"); + auto fixed_ratios = ctx.Attr>("fixed_ratios"); + auto densities = ctx.Attr>("densities"); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = 0; + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + int step_average = static_cast((step_width + step_height) * 0.5); + + framework::Tensor h_temp; + T* tdata = h_temp.mutable_data({num_priors * 4}, platform::CPUPlace()); + int idx = 0; + for (size_t s = 0; s < fixed_sizes.size(); ++s) { + auto fixed_size = fixed_sizes[s]; + int density = densities[s]; + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float ar = fixed_ratios[r]; + int shift = step_average / density; + float box_width_ratio = fixed_size * sqrt(ar); + float box_height_ratio = fixed_size / sqrt(ar); + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = shift / 2. + dj * shift - step_average / 2.; + float center_y_temp = shift / 2. + di * shift - step_average / 2.; + tdata[idx] = box_width_ratio; + tdata[num_priors + idx] = box_height_ratio; + tdata[2 * num_priors + idx] = center_x_temp; + tdata[3 * num_priors + idx] = center_y_temp; + idx++; + } + } + } + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + framework::Tensor d_temp; + framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp); + + // At least use 32 threads, at most 512 threads. + // blockx is multiple of 32. 
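+    // ((n + 31) >> 5) << 5 rounds n up to the next multiple of 32 (one warp);
+    // e.g. feature_width * num_priors = 70 gives ((70 + 31) >> 5) << 5 = 96
+    // threads, which std::min then caps at 512.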
+ int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L); + int gridx = (feature_width * num_priors + blockx - 1) / blockx; + dim3 threads(blockx, 1); + dim3 grids(gridx, feature_height); + + auto stream = + ctx.template device_context().stream(); + GenDensityPriorBox<<>>( + feature_height, feature_width, img_height, img_width, offset, + step_width, step_height, num_priors, d_temp.data(), is_clip, + variances[0], variances[1], variances[2], variances[3], + boxes->data(), vars->data()); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(density_prior_box, + ops::DensityPriorBoxOpCUDAKernel, + ops::DensityPriorBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h index 9a52077e9c..ed2f5df80c 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.h +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { step_height = step_h; } int num_priors = 0; - if (fixed_sizes.size() > 0 && densities.size() > 0) { - for (size_t i = 0; i < densities.size(); ++i) { - if (fixed_ratios.size() > 0) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - } + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); } boxes->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); - auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); + auto box_dim = vars->dims(); + boxes->Resize({feature_height, feature_width, num_priors, 4}); + auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); int step_average = static_cast((step_width + step_height) * 0.5); for (int h = 0; h < feature_height; ++h) { @@ -76,36 +74,34 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { auto fixed_size = fixed_sizes[s]; int density = densities[s]; // Generate density prior boxes with fixed ratios. - if (fixed_ratios.size() > 0) { - for (size_t r = 0; r < fixed_ratios.size(); ++r) { - float ar = fixed_ratios[r]; - int shift = step_average / density; - float box_width_ratio = fixed_size * sqrt(ar); - float box_height_ratio = fixed_size / sqrt(ar); - for (int di = 0; di < density; ++di) { - for (int dj = 0; dj < density; ++dj) { - float center_x_temp = - center_x - step_average / 2. + shift / 2. + dj * shift; - float center_y_temp = - center_y - step_average / 2. + shift / 2. + di * shift; - e_boxes(h, w, idx, 0) = - (center_x_temp - box_width_ratio / 2.) / img_width >= 0 - ? (center_x_temp - box_width_ratio / 2.) / img_width - : 0; - e_boxes(h, w, idx, 1) = - (center_y_temp - box_height_ratio / 2.) / img_height >= 0 - ? (center_y_temp - box_height_ratio / 2.) / img_height - : 0; - e_boxes(h, w, idx, 2) = - (center_x_temp + box_width_ratio / 2.) / img_width <= 1 - ? (center_x_temp + box_width_ratio / 2.) / img_width - : 1; - e_boxes(h, w, idx, 3) = - (center_y_temp + box_height_ratio / 2.) / img_height <= 1 - ? (center_y_temp + box_height_ratio / 2.) 
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h
index 9a52077e9c..ed2f5df80c 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
       step_height = step_h;
     }
     int num_priors = 0;
-    if (fixed_sizes.size() > 0 && densities.size() > 0) {
-      for (size_t i = 0; i < densities.size(); ++i) {
-        if (fixed_ratios.size() > 0) {
-          num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
-        }
-      }
+    for (size_t i = 0; i < densities.size(); ++i) {
+      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
     }
     boxes->mutable_data<T>(ctx.GetPlace());
     vars->mutable_data<T>(ctx.GetPlace());
 
-    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes).setConstant(0.0);
+    auto box_dim = vars->dims();
+    boxes->Resize({feature_height, feature_width, num_priors, 4});
+    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes).setConstant(0.0);
     int step_average = static_cast<int>((step_width + step_height) * 0.5);
 
     for (int h = 0; h < feature_height; ++h) {
@@ -76,36 +74,34 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
         auto fixed_size = fixed_sizes[s];
         int density = densities[s];
         // Generate density prior boxes with fixed ratios.
-        if (fixed_ratios.size() > 0) {
-          for (size_t r = 0; r < fixed_ratios.size(); ++r) {
-            float ar = fixed_ratios[r];
-            int shift = step_average / density;
-            float box_width_ratio = fixed_size * sqrt(ar);
-            float box_height_ratio = fixed_size / sqrt(ar);
-            for (int di = 0; di < density; ++di) {
-              for (int dj = 0; dj < density; ++dj) {
-                float center_x_temp =
-                    center_x - step_average / 2. + shift / 2. + dj * shift;
-                float center_y_temp =
-                    center_y - step_average / 2. + shift / 2. + di * shift;
-                e_boxes(h, w, idx, 0) =
-                    (center_x_temp - box_width_ratio / 2.) / img_width >= 0
-                        ? (center_x_temp - box_width_ratio / 2.) / img_width
-                        : 0;
-                e_boxes(h, w, idx, 1) =
-                    (center_y_temp - box_height_ratio / 2.) / img_height >= 0
-                        ? (center_y_temp - box_height_ratio / 2.) / img_height
-                        : 0;
-                e_boxes(h, w, idx, 2) =
-                    (center_x_temp + box_width_ratio / 2.) / img_width <= 1
-                        ? (center_x_temp + box_width_ratio / 2.) / img_width
-                        : 1;
-                e_boxes(h, w, idx, 3) =
-                    (center_y_temp + box_height_ratio / 2.) / img_height <= 1
-                        ? (center_y_temp + box_height_ratio / 2.) / img_height
-                        : 1;
-                idx++;
-              }
+        for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+          float ar = fixed_ratios[r];
+          int shift = step_average / density;
+          float box_width_ratio = fixed_size * sqrt(ar);
+          float box_height_ratio = fixed_size / sqrt(ar);
+          for (int di = 0; di < density; ++di) {
+            for (int dj = 0; dj < density; ++dj) {
+              float center_x_temp =
+                  center_x - step_average / 2. + shift / 2. + dj * shift;
+              float center_y_temp =
+                  center_y - step_average / 2. + shift / 2. + di * shift;
+              e_boxes(h, w, idx, 0) =
+                  (center_x_temp - box_width_ratio / 2.) / img_width >= 0
+                      ? (center_x_temp - box_width_ratio / 2.) / img_width
+                      : 0;
+              e_boxes(h, w, idx, 1) =
+                  (center_y_temp - box_height_ratio / 2.) / img_height >= 0
+                      ? (center_y_temp - box_height_ratio / 2.) / img_height
+                      : 0;
+              e_boxes(h, w, idx, 2) =
+                  (center_x_temp + box_width_ratio / 2.) / img_width <= 1
+                      ? (center_x_temp + box_width_ratio / 2.) / img_width
+                      : 1;
+              e_boxes(h, w, idx, 3) =
+                  (center_y_temp + box_height_ratio / 2.) / img_height <= 1
+                      ? (center_y_temp + box_height_ratio / 2.) / img_height
+                      : 1;
+              idx++;
             }
           }
         }
@@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
     e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
 
     vars->Resize(var_dim);
+    boxes->Resize(box_dim);
   }
 };  // class DensityPriorBoxOpKernel
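The di/dj loops place a density x density grid of box centers symmetrically inside each step cell. A plain-Python sketch of the resulting offsets, with illustrative values (step_average = 31, density = 4):

    step_average, density = 31, 4
    shift = step_average // density  # integer division, as in the C++ code
    offsets = [shift / 2. + j * shift - step_average / 2.
               for j in range(density)]
    assert offsets == [-12.0, -5.0, 2.0, 9.0]  # symmetric about the center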
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3f17400a14..4843af8340 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -1029,6 +1029,7 @@ def density_prior_box(input,
                       clip=False,
                       steps=[0.0, 0.0],
                       offset=0.5,
+                      flatten_to_2d=False,
                       name=None):
     """
     **Density Prior Box Operator**
@@ -1065,22 +1066,24 @@ def density_prior_box(input,
            height/width of the input will be automatically calculated.
            Default: [0., 0.]
        offset(float): Prior boxes center offset. Default: 0.5
+       flatten_to_2d(bool): Whether to flatten the output prior boxes and
+           variances to a 2D shape whose second dim is 4. Default: False.
        name(str): Name of the density prior box op. Default: None.

    Returns:
        tuple: A tuple with two Variable (boxes, variances)

        boxes: the output density prior boxes of PriorBox.
-            The layout is [H, W, num_priors, 4].
-            H is the height of input, W is the width of input,
-            num_priors is the total
-            box count of each position of input.
+            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
+            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
+            H is the height of input, W is the width of input,
+            num_priors is the total box count of each position of input.

        variances: the expanded variances of PriorBox.
-            The layout is [H, W, num_priors, 4].
-            H is the height of input, W is the width of input
-            num_priors is the total
-            box count of each position of input
+            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
+            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
+            H is the height of input, W is the width of input,
+            num_priors is the total box count of each position of input.

    Examples:
@@ -1089,14 +1092,11 @@ def density_prior_box(input,
            box, var = fluid.layers.density_prior_box(
                input=conv1,
                image=images,
-                min_sizes=[100.],
-                max_sizes=[200.],
-                aspect_ratios=[1.0, 1.0 / 2.0, 2.0],
-                densities=[3, 4],
-                fixed_sizes=[50., 60.],
-                fixed_ratios=[1.0, 3.0, 1.0 / 3.0],
-                flip=True,
-                clip=True)
+                densities=[4, 2, 1],
+                fixed_sizes=[32.0, 64.0, 128.0],
+                fixed_ratios=[1.],
+                clip=True,
+                flatten_to_2d=True)
    """
    helper = LayerHelper("density_prior_box", **locals())
    dtype = helper.input_dtype()
@@ -1127,14 +1127,11 @@ def density_prior_box(input,
        'step_w': steps[0],
        'step_h': steps[1],
        'offset': offset,
+        'densities': densities,
+        'fixed_sizes': fixed_sizes,
+        'fixed_ratios': fixed_ratios,
+        'flatten_to_2d': flatten_to_2d,
    }
-    if densities is not None and len(densities) > 0:
-        attrs['densities'] = densities
-    if fixed_sizes is not None and len(fixed_sizes) > 0:
-        attrs['fixed_sizes'] = fixed_sizes
-    if fixed_ratios is not None and len(fixed_ratios) > 0:
-        attrs['fixed_ratios'] = fixed_ratios
-
    box = helper.create_variable_for_type_inference(dtype)
    var = helper.create_variable_for_type_inference(dtype)
    helper.append_op(
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 982d291801..a2eca5541a 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -112,38 +112,42 @@ class TestDetection(unittest.TestCase):

 class TestPriorBox(unittest.TestCase):
     def test_prior_box(self):
-        data_shape = [3, 224, 224]
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
-        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-        box, var = layers.prior_box(
-            input=conv1,
-            image=images,
-            min_sizes=[100.0],
-            aspect_ratios=[1.],
-            flip=True,
-            clip=True)
-        assert len(box.shape) == 4
-        assert box.shape == var.shape
-        assert box.shape[3] == 4
+        program = Program()
+        with program_guard(program):
+            data_shape = [3, 224, 224]
+            images = fluid.layers.data(
+                name='pixel', shape=data_shape, dtype='float32')
+            conv1 = fluid.layers.conv2d(images, 3, 3, 2)
+            box, var = layers.prior_box(
+                input=conv1,
+                image=images,
+                min_sizes=[100.0],
+                aspect_ratios=[1.],
+                flip=True,
+                clip=True)
+            assert len(box.shape) == 4
+            assert box.shape == var.shape
+            assert box.shape[3] == 4


 class TestDensityPriorBox(unittest.TestCase):
     def test_density_prior_box(self):
-        data_shape = [3, 224, 224]
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
-        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-        box, var = layers.density_prior_box(
-            input=conv1,
-            image=images,
-            densities=[3, 4],
-            fixed_sizes=[50., 60.],
-            fixed_ratios=[1.0],
-            clip=True)
-        assert len(box.shape) == 4
-        assert box.shape == var.shape
-        assert box.shape[3] == 4
+        program = Program()
+        with program_guard(program):
+            data_shape = [3, 224, 224]
+            images = fluid.layers.data(
+                name='pixel', shape=data_shape, dtype='float32')
+            conv1 = fluid.layers.conv2d(images, 3, 3, 2)
+            box, var = layers.density_prior_box(
+                input=conv1,
+                image=images,
+                densities=[3, 4],
+                fixed_sizes=[50., 60.],
+                fixed_ratios=[1.0],
+                clip=True)
+            assert len(box.shape) == 4
+            assert box.shape == var.shape
+            assert box.shape[-1] == 4


 class TestAnchorGenerator(unittest.TestCase):
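flatten_to_2d changes only the layout, not the box values; a plain-numpy sketch of the equivalence that the operator test below verifies (shapes illustrative):

    import numpy as np

    # [H, W, num_priors, 4] flattened to [H * W * num_priors, 4].
    boxes_4d = np.arange(17 * 17 * 21 * 4, dtype=np.float32).reshape(
        (17, 17, 21, 4))
    boxes_2d = boxes_4d.reshape((-1, 4))
    assert boxes_2d.shape == (17 * 17 * 21, 4)
    assert np.array_equal(boxes_2d[0], boxes_4d[0, 0, 0])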
diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
index 79d1fd3d71..4b0bc1dcf8 100644
--- a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
@@ -36,7 +36,8 @@ class TestDensityPriorBoxOp(OpTest):
             'offset': self.offset,
             'densities': self.densities,
             'fixed_sizes': self.fixed_sizes,
-            'fixed_ratios': self.fixed_ratios
+            'fixed_ratios': self.fixed_ratios,
+            'flatten_to_2d': self.flatten_to_2d
         }
         self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}

@@ -48,16 +49,17 @@ class TestDensityPriorBoxOp(OpTest):
         self.set_data()

     def set_density(self):
-        self.densities = []
-        self.fixed_sizes = []
-        self.fixed_ratios = []
+        self.densities = [4, 2, 1]
+        self.fixed_sizes = [32.0, 64.0, 128.0]
+        self.fixed_ratios = [1.0]
+        self.layer_w = 17
+        self.layer_h = 17
+        self.image_w = 533
+        self.image_h = 533
+        self.flatten_to_2d = False

     def init_test_params(self):
-        self.layer_w = 32
-        self.layer_h = 32
-
-        self.image_w = 40
-        self.image_h = 40
+        self.set_density()

         self.step_w = float(self.image_w) / float(self.layer_w)
         self.step_h = float(self.image_h) / float(self.layer_h)
@@ -69,8 +71,6 @@ class TestDensityPriorBoxOp(OpTest):
         self.variances = [0.1, 0.1, 0.2, 0.2]
         self.variances = np.array(self.variances, dtype=np.float).flatten()

-        self.set_density()
-
         self.clip = True
         self.num_priors = 0
         if len(self.fixed_sizes) > 0 and len(self.densities) > 0:
@@ -129,6 +129,9 @@ class TestDensityPriorBoxOp(OpTest):
             (self.layer_h, self.layer_w, self.num_priors, 1))
         self.out_boxes = out_boxes.astype('float32')
         self.out_var = out_var.astype('float32')
+        if self.flatten_to_2d:
+            self.out_boxes = self.out_boxes.reshape((-1, 4))
+            self.out_var = self.out_var.reshape((-1, 4))


 class TestDensityPriorBox(TestDensityPriorBoxOp):
@@ -136,6 +139,11 @@ class TestDensityPriorBox(TestDensityPriorBoxOp):
         self.densities = [3, 4]
         self.fixed_sizes = [1.0, 2.0]
         self.fixed_ratios = [1.0]
+        self.layer_w = 32
+        self.layer_h = 32
+        self.image_w = 40
+        self.image_h = 40
+        self.flatten_to_2d = True


 if __name__ == '__main__':
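With the updated defaults above (a 533 x 533 image over a 17 x 17 feature map), the automatically derived steps work out as follows; a plain-Python sketch mirroring the kernel arithmetic (illustrative only):

    image, layer = 533.0, 17.0
    step_w = step_h = image / layer              # ~31.35, since steps are 0
    step_average = int((step_w + step_h) * 0.5)  # truncates to 31
    assert step_average == 31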
From 61c5f13fcf92c18f30c05a90e3d3badd884f9340 Mon Sep 17 00:00:00 2001
From: sabreshao
Date: Fri, 23 Nov 2018 14:27:39 +0800
Subject: [PATCH 80/80] Fix cmake for AMDGPU platform (#13801)

* HIP cmake.
  Enable whole-archive build for the pybind library.
  Disable two warnings.
  Roll back to C++11.
  Link RCCL to work around a GPU kernel loading issue.
  Update eigen to fix a build failure.
  Add more include directories.
  Fix the O3 build failure.
  Update eigen.
  Fix the tensor_util_test segmentation fault.
  Add more macro checks in hip.cmake; we may consider refining hip.cmake to
  inherit all add_definitions() from the parent scope in the future.
  Fix rocRAND loading.
  Update eigen to fix gru_unit_op and reduce_op.
  Add HIP support to testing.
  Update eigen to support int16 and int8 in arg min and arg max.
* Add rocprim as the cub library used by the nv implementation.
* Reduce build time in rocprim.
* Add a rocprim introduction and remove useless cmake code.
* Remove useless flags and format the cmake file.
---
 CMakeLists.txt                      |  1 +
 cmake/external/eigen.cmake          |  2 +-
 cmake/external/rocprim.cmake        | 44 +++++++++++++++++++++++++++++
 cmake/flags.cmake                   |  3 ++
 cmake/generic.cmake                 | 26 +++++++++--------
 cmake/hip.cmake                     | 32 +++++++++++++++++----
 paddle/fluid/pybind/CMakeLists.txt  |  4 +--
 paddle/testing/paddle_gtest_main.cc |  2 +-
 8 files changed, 94 insertions(+), 20 deletions(-)
 create mode 100644 cmake/external/rocprim.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3059ab7e0e..8dcf9786e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -204,6 +204,7 @@ include(external/eigen)    # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
+include(external/rocprim)
 include(external/xxhash)    # download xxhash
 include(external/dlpack)
 include(external/snappy)    # download snappy
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 573ad5e5f0..6aef97f212 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -17,7 +17,7 @@ if(WITH_AMD_GPU)
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
         GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
-        GIT_TAG         0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+        GIT_TAG         7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e
         PREFIX          ${EIGEN_SOURCE_DIR}
         UPDATE_COMMAND  ""
         CONFIGURE_COMMAND ""
diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake
new file mode 100644
index 0000000000..914c064918
--- /dev/null
+++ b/cmake/external/rocprim.cmake
@@ -0,0 +1,44 @@
+if (NOT WITH_AMD_GPU)
+  return()
+endif()
+
+# rocprim is "ROCm Parallel Primitives" for short.
+# It is a header-only library providing HIP and HC parallel primitives
+# for developing performant GPU-accelerated code on the AMD ROCm platform.
+
+if("x${HCC_HOME}" STREQUAL "x")
+  set(HCC_HOME "/opt/rocm/hcc")
+endif()
+
+INCLUDE(ExternalProject)
+
+SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim)
+SET(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim)
+SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include)
+
+ExternalProject_Add(
+  extern_rocprim
+  GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git"
+  GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc
+  PREFIX ${ROCPRIM_SOURCE_DIR}
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc
+  CMAKE_ARGS -DONLY_INSTALL=ON
+  CMAKE_ARGS -DBUILD_TEST=OFF
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR}
+
+  INSTALL_DIR ${ROCPRIM_INSTALL_DIR}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+)
+
+INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR})
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c)
+  file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";")
+  add_library(rocprim STATIC ${dummyfile})
+else()
+  add_library(rocprim INTERFACE)
+endif()
+
+add_dependencies(rocprim extern_rocprim)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 343e44ab4b..c4472040ce 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -129,6 +129,9 @@ set(COMMON_FLAGS
     -Wno-error=parentheses-equality # Warnings in pybind11
     -Wno-error=ignored-attributes  # Warnings in Eigen, gcc 6.3
     -Wno-error=terminate  # Warning in PADDLE_ENFORCE
+    -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
+    -Wimplicit-fallthrough=0 # Warning in tinyformat.h
+    -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
 )

 set(GPU_COMMON_FLAGS
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 111627a932..7d803d00ef 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -454,25 +454,29 @@ function(hip_library TARGET_NAME)
     else()
       add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
       set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
-      target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
-      find_fluid_modules(${TARGET_NAME})
+      target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so)
+      find_fluid_modules(${TARGET_NAME})
     endif()
-    if (hip_library_DEPS)
-      add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
-      target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+    if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
+      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+      # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will
+      # be linked by other libraries.
+      target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END)
+    else()
+      target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
     endif()
     # cpplint code style
     foreach(source_file ${hip_library_SRCS})
-      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-        list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-      endif()
+      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+      endif()
     endforeach()
   else(hip_library_SRCS)
     if (hip_library_DEPS)
-      merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+      merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
     else()
-      message(FATAL "Please specify source file or library in nv_library.")
+      message(FATAL_ERROR "Please specify source file or library in hip_library.")
     endif()
   endif(hip_library_SRCS)
 endif()
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index bfe491bd6b..4276bc5b08 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU)
 endif()

 include_directories("/opt/rocm/include")
+include_directories("/opt/rocm/hip/include")
+include_directories("/opt/rocm/miopen/include")
 include_directories("/opt/rocm/hipblas/include")
 include_directories("/opt/rocm/hiprand/include")
 include_directories("/opt/rocm/rocrand/include")
@@ -11,20 +13,40 @@
 include_directories("/opt/rocm/thrust")

 list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")

-set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )

 if(WITH_DSO)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
 endif(WITH_DSO)

-if(WITH_DOUBLE)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
-endif(WITH_DOUBLE)
-
 if(WITH_TESTING)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
 endif(WITH_TESTING)

+if(WITH_DISTRIBUTE)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE")
+endif(WITH_DISTRIBUTE)
+
+if(WITH_GRPC)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
+endif(WITH_GRPC)
+
+if(NOT WITH_GOLANG)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
+endif(NOT WITH_GOLANG)
+
+if(WITH_MKLDNN)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
+endif(WITH_MKLDNN)
+
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
+
+if(NOT WITH_RDMA)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
+endif(NOT WITH_RDMA)
+
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
   list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
 elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index fb6ee2f4a5..25d241d976 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -5,8 +5,8 @@ if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
       SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS}
-      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+      DEPS ARCHIVE_START ${PYBIND_DEPS}
+      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END)
   else()
     cc_library(paddle_pybind SHARED
       SRCS ${PYBIND_SRCS}
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 598f435461..babb862122 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -28,7 +28,7 @@ int main(int argc, char** argv) {
   for (int i = 0; i < argc; ++i) {
     new_argv.push_back(argv[i]);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   new_argv.push_back(
       strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"));
 #else