From 16bc8f2a755438cac5a279f27b082dbaf0e3e3a0 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 19 Nov 2018 21:52:59 +0800 Subject: [PATCH 01/36] Add debug info --- .dockerignore | 1 + Dockerfile | 86 +++++++++++++++++++++++++-------------------------- 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/.dockerignore b/.dockerignore index 2b2e74053d..49adfe4f0a 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,6 @@ *.DS_Store build/ +build* *.user .vscode .idea diff --git a/Dockerfile b/Dockerfile index c8b9eed6d6..b36102175c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,46 +71,46 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip3 install -U wheel && \ - pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - easy_install -U pip && \ - pip install -U pip setuptools wheel && \ - pip install -U docopt PyYAML sphinx==1.5.6 && \ - pip install sphinx-rtd-theme==0.1.9 recommonmark - -RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 install opencv-python && \ - pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install opencv-python - -#For docstring checker -RUN pip3 install pylint pytest astroid isort -RUN pip install pylint pytest astroid isort LinkChecker - -COPY ./python/requirements.txt /root/ -RUN pip3 install -r /root/requirements.txt -RUN pip install -r /root/requirements.txt - -# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use -# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -RUN apt-get install -y libssl-dev libffi-dev -RUN pip3 install certifi urllib3[secure] -RUN pip install certifi urllib3[secure] - - -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . \ - make) - -# Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service
-RUN mkdir /var/run/sshd
-RUN echo 'root:root' | chpasswd
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-EXPOSE 22
+# RUN pip3 install -U wheel && \
+    # pip3 install -U docopt PyYAML sphinx==1.5.6 && \
+    # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    # easy_install -U pip && \
+    # pip install -U pip setuptools wheel && \
+    # pip install -U docopt PyYAML sphinx==1.5.6 && \
+    # pip install sphinx-rtd-theme==0.1.9 recommonmark
+
+# RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    # pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    # pip3 install opencv-python && \
+    # pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    # pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    # pip install opencv-python
+
+# #For docstring checker
+# RUN pip3 install pylint pytest astroid isort
+# RUN pip install pylint pytest astroid isort LinkChecker
+
+# COPY ./python/requirements.txt /root/
+# RUN pip3 install -r /root/requirements.txt
+# RUN pip install -r /root/requirements.txt
+
+# # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
+# # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
+# RUN apt-get install -y libssl-dev libffi-dev
+# RUN pip3 install certifi urllib3[secure]
+# RUN pip install certifi urllib3[secure]
+
+
+# # Install woboq_codebrowser to /woboq
+# RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
+    # (cd /woboq \
+    # cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
+    # -DCMAKE_BUILD_TYPE=Release . \
+    # make)
+
+# # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
+# RUN mkdir /var/run/sshd
+# RUN echo 'root:root' | chpasswd
+# RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+# RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+# EXPOSE 22

From c19ff1f3d28b38867de8b98d63f19b8c759c4535 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Wed, 21 Nov 2018 15:37:36 +0800
Subject: [PATCH 02/36] Add python3.6 and python3.7 support in paddle build scripts

test=develop
---
 paddle/scripts/paddle_build.sh | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 32f9bca645..569e56e5a9 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -116,6 +116,18 @@ function cmake_gen() {
 export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
+ elif [ "$1" == "cp36-cp36m" ]; then
+ export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
+ export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
+ export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
+ -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
+ -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
+ elif [ "$1" == "cp37-cp37m" ]; then
+ export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
+ export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
+ export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3
+
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so" fi fi fi @@ -419,7 +431,7 @@ function assert_api_not_changed() { source .env/bin/activate pip install ${PADDLE_ROOT}/build/python/dist/*whl python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec - if [ "$1" == "cp35-cp35m" ]; then + if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' new.spec sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec From 255cc1eb6540785c8cb786a6c9f291fa53010ca0 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 21 Nov 2018 15:43:17 +0800 Subject: [PATCH 03/36] Add support for Mac build test=develop --- paddle/scripts/paddle_build.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 569e56e5a9..9632eaec00 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -94,6 +94,30 @@ function cmake_gen() { else exit 1 fi + elif [ "$1" == "cp36-cp36m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi + elif [ "$1" == "cp37-cp37m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi fi else if [ "$1" != "" ]; then From 3e3599f3d937e0444606056f3c9f2261b74dfd93 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 21 Nov 2018 11:31:04 +0000 Subject: [PATCH 04/36] Refine split tensorrt plugin --- .../inference/tensorrt/convert/split_op.cc | 3 +- .../tensorrt/plugin/split_op_plugin.cu | 157 ++++++++++++++---- .../tensorrt/plugin/split_op_plugin.h | 9 +- 3 files changed, 134 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 6620c76318..871354267e 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -40,7 +40,7 @@ class SplitOpConverter : public OpConverter { int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = boost::get>(op_desc.GetAttr("sections")); - PADDLE_ENFORCE(axis != 0); + 
// PADDLE_ENFORCE(axis != 0); if (axis < 0) { axis += input_dims.nbDims; } else { @@ -48,7 +48,6 @@ class SplitOpConverter : public OpConverter { } PADDLE_ENFORCE(output_lengths.size() == output_num); - // plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 4adea2db1e..1ec0753e9f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { @@ -19,6 +21,52 @@ namespace inference { namespace tensorrt { namespace plugin { +// copied from operators::math::SplitFunctor +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int* out_cols, + int out_cols_size, T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = out_cols[0]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = out_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs_data[curr_segment]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int fixed_out_col, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } + } +} + nvinfer1::Dims SplitPlugin::getOutputDimensions( int index, const nvinfer1::Dims* input_dims, int num_inputs) { PADDLE_ENFORCE_EQ(num_inputs, 1); @@ -31,48 +79,95 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions( int SplitPlugin::initialize() { PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS); - + // notice input dims is [C, H, W] + nvinfer1::Dims dims = this->getInputDims(0); + outer_rows_ = 1; + inner_cols_ = 1; + for (int i = 0; i < axis_; ++i) { + outer_rows_ *= dims.d[i]; + } + for (int i = axis_ + 1; i < dims.nbDims; ++i) { + inner_cols_ *= dims.d[i]; + } + same_shape_ = true; std::vector segment_offsets(1, 0); for (int i = 0; i < this->getNbOutputs(); ++i) { - segment_offsets.push_back(segment_offsets.back() + output_length_[i]); + if (output_length_[i] != output_length_[0]) { + same_shape_ = false; + } + segment_offsets.push_back(segment_offsets.back() + + output_length_[i] * inner_cols_); } - segment_offsets_ = segment_offsets; - nvinfer1::Dims dims = this->getInputDims(0); - nx_ = 1; - for (int i = dims.nbDims - 1; i > axis_; 
--i) { - nx_ *= dims.d[i]; + inner_cols_ *= dims.d[axis_]; + d_segment_offsets_ = segment_offsets; + segment_offsets_ = std::move(segment_offsets); + d_output_ptrs_.resize(this->getNbOutputs(), nullptr); + return 0; +} + +template +inline void Split(cudaStream_t stream, const bool same_shape, + const int outer_rows, const int inner_cols, + const std::vector& segment_offsets, + const int* d_segment_offsets, const T* input, T** outputs) { + const int kThreadsPerBlock = 1024; + const int kMaxBlocks = 65535; + int block_cols = kThreadsPerBlock; + if (inner_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((inner_cols + 31) >> 5) << 5; } - ny_ = dims.d[axis_]; - nz_ = 1; - for (int i = axis_ - 1; i >= 0; --i) { - nz_ *= dims.d[i]; + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int grid_cols = + std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks); + int grid_rows = + std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (same_shape) { + SplitKernel<<>>( + input, outer_rows, inner_cols, segment_offsets[1], outputs); + } else { + SplitKernel<<>>( + input, outer_rows, inner_cols, d_segment_offsets, + static_cast(segment_offsets.size()), outputs); } - return 0; } int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { - auto const& input_dims = this->getInputDims(0); - int input_size = 0; - float const* idata = reinterpret_cast(inputs[0]); - float** odatas = reinterpret_cast(outputs); - - // kernel impl here. - int inputBatchOffset = nx_ * ny_ * nz_; - for (size_t i = 0; i < this->getNbOutputs(); i++) { - for (size_t j = 0; j < batchSize; j++) { - cudaMemcpyAsync( - odatas[i] + - j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * - sizeof(float), - inputs[0] + - (inputBatchOffset * j + segment_offsets_[i] * nx_) * - sizeof(float), - (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float), - cudaMemcpyDeviceToDevice, stream); + float const* input_ptr = reinterpret_cast(inputs[0]); + if (axis_ == -1 && this->getNbOutputs() < 10) { + float** output_ptrs = reinterpret_cast(outputs); + int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) + ? 
sizeof(__half) + : sizeof(float); + for (int i = 0; i < this->getNbOutputs(); ++i) { + PADDLE_ENFORCE( + cudaMemcpyAsync( + output_ptrs[i], input_ptr + segment_offsets_[i], + (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size, + cudaMemcpyDeviceToDevice, stream) == cudaSuccess); + } + } else { + outer_rows_ *= batchSize; + const int* d_segment_offsets_ptr = + thrust::raw_pointer_cast(&d_segment_offsets_[0]); + float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); + PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs, + this->getNbOutputs() * sizeof(float*), + cudaMemcpyHostToDevice, + stream) == cudaSuccess); + if (this->getDataType() == nvinfer1::DataType::kFLOAT) { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, input_ptr, output_ptrs); + } else { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, (__half*)input_ptr, // NOLINT + (__half**)output_ptrs); // NOLINT } } - return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index b5b6e69992..6f028d3d72 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -25,7 +26,7 @@ namespace plugin { class SplitPlugin : public PluginTensorRT { public: SplitPlugin(int axis, std::vector const &output_lengths) - : axis_(axis), output_length_(output_lengths) {} + : axis_(axis), same_shape_(true), output_length_(output_lengths) {} SplitPlugin(void const *serial_data, size_t serial_length) { deserializeBase(serial_data, serial_length); @@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT { } int axis_; + int outer_rows_; + int inner_cols_; + bool same_shape_; std::vector output_length_; - int nx_, ny_, nz_; std::vector segment_offsets_; + thrust::device_vector d_segment_offsets_; + thrust::device_vector d_output_ptrs_; }; } // namespace plugin From 6eba5bd276a8d79d5611ec42db0c47273fb4950c Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 21 Nov 2018 15:32:25 +0000 Subject: [PATCH 05/36] Fix direct copy and refine split ut test=develop --- .../tensorrt/convert/test_split_op.cc | 55 ++++++++++++++----- .../tensorrt/plugin/split_op_plugin.cu | 7 ++- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index f81d011552..23909378dd 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -20,30 +20,59 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(split_op, test) { +template +void TensorRTSplitTest(const std::vector &in_shape, + const std::vector §ions) { std::unordered_set parameters({""}); framework::Scope scope; - TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2)); - validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2)); - validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2)); + TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000); + + auto make_dim = [](const std::vector &shape) { + nvinfer1::DimsCHW dim; + dim.c() = shape[0]; + dim.h() = shape[1]; + dim.w() = shape[2]; + 
return dim; + }; + validator.DeclInputVar("split_input", make_dim(in_shape)); + std::vector output_vars; + for (size_t i = 0; i < sections.size(); ++i) { + auto out_shape = in_shape; + out_shape[Axis - 1] = sections[i]; + std::string output_name = "split_out" + std::to_string(i); + validator.DeclOutputVar(output_name, make_dim(out_shape)); + output_vars.push_back(output_name); + } // Prepare Op description framework::OpDesc desc; desc.SetType("split"); desc.SetInput("X", {"split_input"}); - desc.SetOutput("Out", {"split_out1", "split_out2"}); + desc.SetOutput("Out", output_vars); - int num = 0; - int axis = 1; - std::vector output_lengths = {2, 1}; - desc.SetAttr("axis", axis); - desc.SetAttr("num", num); - desc.SetAttr("sections", output_lengths); + desc.SetAttr("axis", Axis); + desc.SetAttr("num", 0); + desc.SetAttr("sections", sections); validator.SetOp(*desc.Proto()); - validator.Execute(1); + validator.Execute(BatchSize); +} + +TEST(split_op, test_same_shape_batch1) { + TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2}); +} + +TEST(split_op, test_different_shape_batch1) { + TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1}); +} + +TEST(split_op, test_same_shape_batch10) { + TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2}); +} + +TEST(split_op, test_different_shape_batch10) { + TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1}); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 1ec0753e9f..de61ace59e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -138,11 +138,12 @@ inline void Split(cudaStream_t stream, const bool same_shape, int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { float const* input_ptr = reinterpret_cast(inputs[0]); - if (axis_ == -1 && this->getNbOutputs() < 10) { + if (((batchSize == 1 && axis_ == 0) || axis_ == -1) && + this->getNbOutputs() < 10) { float** output_ptrs = reinterpret_cast(outputs); int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) - ? sizeof(__half) - : sizeof(float); + ? 
sizeof(float) + : sizeof(__half); for (int i = 0; i < this->getNbOutputs(); ++i) { PADDLE_ENFORCE( cudaMemcpyAsync( From 3912545ffec3ea5a850420f0a804afadc9f0352a Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 22 Nov 2018 04:30:19 +0000 Subject: [PATCH 06/36] add dlpack support test=develop --- CMakeLists.txt | 1 + cmake/external/dlpack.cmake | 31 +++++ paddle/fluid/framework/CMakeLists.txt | 3 + paddle/fluid/framework/dlpack_tensor.cc | 127 +++++++++++++++++++ paddle/fluid/framework/dlpack_tensor.h | 45 +++++++ paddle/fluid/framework/dlpack_tensor_test.cc | 113 +++++++++++++++++ 6 files changed, 320 insertions(+) create mode 100644 cmake/external/dlpack.cmake create mode 100644 paddle/fluid/framework/dlpack_tensor.cc create mode 100644 paddle/fluid/framework/dlpack_tensor.h create mode 100644 paddle/fluid/framework/dlpack_tensor_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index c62cc9bfd7..b6ae241272 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,6 +190,7 @@ include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) include(external/xxhash) # download xxhash +include(external/dlpack) if (NOT WIN32) # there is no official support of snappystream, warpctc, nccl, cupti in windows diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake new file mode 100644 index 0000000000..94d8fcc668 --- /dev/null +++ b/cmake/external/dlpack.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) + +set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack) +set(DLPACK_INCLUDE_DIR ${DLPACK_SOURCE_DIR}/src/extern_dlpack/include) + +include_directories(${DLPACK_INCLUDE_DIR}) + +ExternalProject_Add( + extern_dlpack + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/dmlc/dlpack.git" + GIT_TAG "v0.2" + PREFIX ${DLPACK_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/dlpack_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(dlpack STATIC ${dummyfile}) +else() + add_library(dlpack INTERFACE) +endif() + +add_dependencies(dlpack extern_dlpack) + +LIST(APPEND externl_project_dependencies dlpack) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cb9057672c..d7d7834b49 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -205,3 +205,6 @@ cc_test(tuple_test SRCS tuple_test.cc ) if (NOT WIN32) cc_test(rw_lock_test SRCS rw_lock_test.cc) endif (NOT WIN32) + +cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) +cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc new file mode 100644 index 0000000000..04e3f78afe --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" + +namespace paddle { +namespace framework { + +namespace internal { +template +static ::DLDataType GetDLDataTypeCode() { + ::DLDataType dtype; + if (std::is_same::value || + std::is_floating_point::value) { + dtype.code = kDLFloat; + } else if (std::is_unsigned::value) { + dtype.code = kDLUInt; + } else if (std::is_integral::value) { + dtype.code = kDLInt; + } else { + PADDLE_THROW("Unsupported data type %s", typeid(T).name()); + } + dtype.bits = 8 * sizeof(T); + dtype.lanes = 1; + return dtype; +} + +static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { +#define REG_DL_DATA_TYPE(type) \ + { std::type_index(typeid(type)), GetDLDataTypeCode() } + static const std::unordered_map + type_to_dtype_map({ + REG_DL_DATA_TYPE(platform::float16), // NOLINT + REG_DL_DATA_TYPE(float), // NOLINT + REG_DL_DATA_TYPE(double), // NOLINT + REG_DL_DATA_TYPE(int), // NOLINT + REG_DL_DATA_TYPE(int64_t), // NOLINT + REG_DL_DATA_TYPE(bool), // NOLINT + REG_DL_DATA_TYPE(size_t), // NOLINT + REG_DL_DATA_TYPE(int16_t), // NOLINT + REG_DL_DATA_TYPE(uint8_t), // NOLINT + REG_DL_DATA_TYPE(int8_t) // NOLINT + }); + static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); + auto it = type_to_dtype_map.find(type); + PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s", + type.name()); + return it->second; +#undef REG_DL_DATA_TYPE +} + +struct DLContextVisitor : public boost::static_visitor<::DLContext> { + inline ::DLContext operator()(const platform::CPUPlace &place) const { + DLContext ctx; + ctx.device_type = kDLCPU; + ctx.device_id = 0; + return ctx; + } + + inline ::DLContext operator()(const platform::CUDAPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLGPU; + ctx.device_id = place.device; + return ctx; +#else + PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version"); +#endif + } + + inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLCPUPinned; + ctx.device_id = 0; + return ctx; +#else + PADDLE_THROW( + "platform::CUDAPinnedPlace is not supported in CPU only version"); +#endif + } +}; +} // namespace internal + +DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { + // init data, data buffer + t_.data = const_cast(tensor.data()); + + // init ctx, DLContext type with device_type and device_id + auto place = tensor.place(); + t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place); + + // init dtype + t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); + t_.dtype.lanes = lanes; + + // init ndim, tensor rank + auto &dims = tensor.dims(); + using DimType = decltype(t_.ndim); // int + t_.ndim = static_cast(dims.size()); + + // init shape, tensor dims + t_.shape = shape_; + for (DimType i = 0; i < t_.ndim; ++i) { + t_.shape[i] = dims[i]; + } + + // init strides, nullptr means the tensor is compact + t_.strides = nullptr; + + // init byte_offset + t_.byte_offset = 0; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h new file mode 100644 index 0000000000..0c52bce1ef --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { + +class DLPackTensor { + public: + using LaneType = decltype(::DLTensor::dtype.lanes); // uint16_t + using ShapeType = + std::remove_reference::type; // int64_t + + // lanes is only used in CPU to enable vectorization + explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1); + + inline operator const ::DLTensor&() const { return t_; } + + inline operator ::DLTensor&() { return t_; } + + private: + ::DLTensor t_; + + // The shape in DLTensor is defined as int64_t* + // Add this member to make TVMTensor init without heap allocation + ShapeType shape_[9]; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc new file mode 100644 index 0000000000..938b056350 --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +namespace { // NOLINT +template +constexpr uint8_t GetDLDataTypeCode() { + return std::is_same::value || + std::is_floating_point::value + ? static_cast(kDLFloat) + : (std::is_unsigned::value + ? static_cast(kDLUInt) + : (std::is_integral::value ? 
static_cast(kDLInt) + : static_cast(-1))); +} +} // NOLINT + +template +void TestMain(const platform::Place &place, uint16_t lanes) { + DDim dims{4, 5, 6, 7}; + Tensor tensor; + tensor.Resize(dims); + void *p = tensor.mutable_data(place); + + DLPackTensor dlpack_tensor(tensor, lanes); + ::DLTensor &dl_tensor = dlpack_tensor; + + CHECK_EQ(p, dl_tensor.data); + if (platform::is_cpu_place(place)) { + CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else if (platform::is_gpu_place(place)) { + CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type); + CHECK_EQ(boost::get(place).device, + dl_tensor.ctx.device_id); + } else if (platform::is_cuda_pinned_place(place)) { + CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else { + CHECK_EQ(false, true); + } + + CHECK_EQ(dims.size(), dl_tensor.ndim); + for (auto i = 0; i < dims.size(); ++i) { + CHECK_EQ(dims[i], dl_tensor.shape[i]); + } + + CHECK_EQ(dl_tensor.strides == nullptr, true); + CHECK_EQ(static_cast(0), dl_tensor.byte_offset); + + CHECK_EQ(lanes, dl_tensor.dtype.lanes); + CHECK_EQ(sizeof(T) * 8, dl_tensor.dtype.bits); + + CHECK_EQ(GetDLDataTypeCode(), dl_tensor.dtype.code); +} + +template +void TestMainLoop() { +#ifdef PADDLE_WITH_CUDA + std::vector places{platform::CPUPlace(), + platform::CUDAPlace(0), + platform::CUDAPinnedPlace()}; + if (platform::GetCUDADeviceCount() > 1) { + places.emplace_back(platform::CUDAPlace(1)); + } +#else + std::vector places{platform::CPUPlace()}; +#endif + std::vector lanes{1, 2}; + for (auto &p : places) { + for (auto &l : lanes) { + TestMain(p, l); + } + } +} + +#define PADDLE_DLPACK_TEST(type) \ + TEST(dlpack, test_##type) { TestMainLoop(); } + +using float16 = platform::float16; +PADDLE_DLPACK_TEST(float16); +PADDLE_DLPACK_TEST(float); +PADDLE_DLPACK_TEST(double); +PADDLE_DLPACK_TEST(int); +PADDLE_DLPACK_TEST(int64_t); +PADDLE_DLPACK_TEST(bool); +PADDLE_DLPACK_TEST(size_t); +PADDLE_DLPACK_TEST(int16_t); +PADDLE_DLPACK_TEST(uint8_t); +PADDLE_DLPACK_TEST(int8_t); + +#undef PADDLE_DLPACK_TEST + +} // namespace framework +} // namespace paddle From 982e48922020e8d5f3ddcfc682068fcbdc5b7fe2 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 22 Nov 2018 06:26:04 +0000 Subject: [PATCH 07/36] test=develop --- python/paddle/fluid/layers/nn.py | 5 +++-- python/paddle/fluid/tests/unittests/test_layers.py | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 99acd7e308..32d411b830 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2139,8 +2139,9 @@ def pool2d(input, input tensor is NCHW, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. - pool_size (int): The side length of pooling windows. All pooling - windows are squares with pool_size on a side. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. pool_type: ${pooling_type_comment} pool_stride (int): stride of the pooling layer. pool_padding (int): padding size. 
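
A minimal usage sketch of the pool_size behavior documented above; the variable names and input shape here are illustrative assumptions, not part of the patch:

    import paddle.fluid as fluid

    # pool_size accepts either a single int (square window) or an
    # (H, W) pair, per the docstring change above.
    x = fluid.layers.data(name='x', shape=[3, 224, 224], dtype='float32')
    y = fluid.layers.pool2d(x, pool_size=[5, 3], pool_type='max')

The added unit test below exercises the same list-valued pool_size through the layers API.
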
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index a8fa5436c4..c4310fe006 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -202,6 +202,12 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_unpad(x=x, length=length)) print(str(program)) + def test_pool2d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') + self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3])) + def test_lstm_unit(self): program = Program() with program_guard(program): From 1adda8e06c075d55edcc6aa50804eab62b903f72 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Thu, 22 Nov 2018 06:53:16 +0000 Subject: [PATCH 08/36] Add more unit tests for split plugin test=develop --- .../inference/tensorrt/convert/split_op.cc | 13 ++--- .../tensorrt/convert/test_split_op.cc | 47 ++++++++++++++++--- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 871354267e..ae5b1b9806 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -19,9 +19,6 @@ namespace paddle { namespace inference { namespace tensorrt { -/* - * SplitOp. - */ class SplitOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, @@ -40,15 +37,11 @@ class SplitOpConverter : public OpConverter { int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = boost::get>(op_desc.GetAttr("sections")); - // PADDLE_ENFORCE(axis != 0); - if (axis < 0) { - axis += input_dims.nbDims; - } else { - axis -= 1; - } + // split on batch is not supported in TensorRT + PADDLE_ENFORCE(axis != 0); + axis += (axis < 0) ? 
input_dims.nbDims : -1; PADDLE_ENFORCE(output_lengths.size() == output_num); - // plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index 23909378dd..5aacc5c600 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -59,21 +59,54 @@ void TensorRTSplitTest(const std::vector &in_shape, validator.Execute(BatchSize); } -TEST(split_op, test_same_shape_batch1) { +// batch = 0, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch1) { TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2}); } - -TEST(split_op, test_different_shape_batch1) { +// batch = 0, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch1) { TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1}); } - -TEST(split_op, test_same_shape_batch10) { +// batch = 10, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch10) { TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2}); } - -TEST(split_op, test_different_shape_batch10) { +// batch = 10, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch10) { TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1}); } +// batch = 0, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2}); +} +// batch = 0, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1}); +} +// batch = 10, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2}); +} +// batch = 10, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1}); +} +// batch = 0, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2}); +} +// batch = 0, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1}); +} +// batch = 10, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2}); +} +// batch = 10, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1}); +} } // namespace tensorrt } // namespace inference From 510601b2793047858763032b7816af07ab2b2bc7 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 22 Nov 2018 09:01:08 +0000 Subject: [PATCH 09/36] test=develop --- python/paddle/fluid/layers/nn.py | 10 +++++++--- python/paddle/fluid/tests/unittests/test_layers.py | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 32d411b830..27f83a60bd 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2139,12 +2139,16 @@ def pool2d(input, input tensor is NCHW, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. - pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple, + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be a square of an int. 
pool_type: ${pooling_type_comment} - pool_stride (int): stride of the pooling layer. - pool_padding (int): padding size. + pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain two integers, (pool_stride_Height, pool_stride_Width). + Otherwise, the pool stride size will be a square of an int. + pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple, + it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width). + Otherwise, the pool padding size will be a square of an int. global_pooling (bool): ${global_pooling_comment} use_cudnn (bool): ${use_cudnn_comment} ceil_mode (bool): ${ceil_mode_comment} diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index c4310fe006..559c9cda48 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -206,7 +206,12 @@ class TestBook(unittest.TestCase): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') - self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3])) + self.assertIsNotNone( + layers.pool2d( + x, + pool_size=[5, 3], + pool_stride=[1, 2], + pool_padding=(2, 1))) def test_lstm_unit(self): program = Program() From 83370576cd8f35e4155d94a789c886c8c264056d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 22 Nov 2018 18:52:54 +0800 Subject: [PATCH 10/36] Add sqlite3 support in Python3.6 test=develop --- tools/manylinux1/build_scripts/build_utils.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh index d97745ad2d..48cce15a14 100755 --- a/tools/manylinux1/build_scripts/build_utils.sh +++ b/tools/manylinux1/build_scripts/build_utils.sh @@ -50,6 +50,15 @@ function do_cpython_build { mkdir -p ${prefix}/lib # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6 + if [ $(lex_pyver $py_ver) -eq $(lex_pyver 3.6) ]; then + wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz + tar -zxf sqlite-autoconf-3250300.tar.gz + cd sqlite-autoconf-3250300 + ./configure --prefix=/usr/local + make -j8 && make install + cd ../ && rm sqlite-autoconf-3250300.tar.gz + fi + # NOTE --enable-shared for generating libpython shared library needed for # linking of some of the nupic.core test executables. 
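 # The LD_LIBRARY_PATH prefixes in the pre-3.7 branch below let CPython's
 # configure and make steps find the SQLite just installed into /usr/local,
 # so the sqlite3 extension module is built against it.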
if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then @@ -59,9 +68,9 @@ function do_cpython_build { make -j8 > /dev/null make altinstall > /dev/null else - CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null - make -j8 > /dev/null - make install > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make -j8 > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make install > /dev/null fi popd echo "ZZZ looking for libpython" From 00b9e9a1357bb3fa6e6adceb4e650d9f6424aa2a Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 22 Nov 2018 20:40:56 +0800 Subject: [PATCH 11/36] Refine cublas to support CUBLAS_TENSOR_OP_MATH (#13929) * refine cublase test=develop * code refine * refine cublas * add GEMME_EX * add enable_cublas_tensor_op_math doc and add cublasCall test=develop * fix CublasCall for cuda version test=develop * fix error test=develop * fix GEMM_EX to be compatible with gcc 4.8 test=develop * add GEMM_EX test=develop * to compatiable with gcc4.8 test=develop --- paddle/fluid/operators/math/blas_impl.cu.h | 206 +++++++++++++++++---- paddle/fluid/platform/device_context.h | 47 +++++ paddle/fluid/platform/dynload/cublas.h | 26 ++- paddle/fluid/platform/gpu_info.cc | 20 ++ paddle/fluid/platform/gpu_info.h | 3 + python/paddle/fluid/__init__.py | 3 +- 6 files changed, 256 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d84c88cb3b..d35073029a 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -16,6 +16,9 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/gpu_info.h" + +DECLARE_bool(enable_cublas_tensor_op_math); namespace paddle { namespace operators { @@ -42,11 +45,44 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...)); #else PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const float *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const float *beta, void *C, + cudaDataType_t Ctype, int ldc) { + // Because the gcc 4.8 doesn't expand template parameter pack that + // appears in a lambda-expression, I can not use template parameter pack + // here. + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + VLOG(5) << "use_tensor_op_math: " + << (platform::TensorCoreAvailable() ? "True" : "False"); + PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc)); +#else + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. 
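+    // CublasCall serializes access with a lock and restores the previous
+    // math mode after the callback returns (see ScopedCublasMathMode in
+    // device_context.h).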
+ dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -69,13 +105,18 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...)); #else PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5"); #endif } + + template + static void GEMM_EX(ARGS... args) { + PADDLE_THROW("Currently there are not cublasDgemmEx."); + } }; template <> @@ -96,14 +137,16 @@ struct CUBlas { reinterpret_cast<__half *>(C), ldc)); } - static void GEMM_BATCH(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const float16 *alpha, const float16 *A, int lda, - long long int strideA, const float16 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const float16 *beta, float16 *C, int ldc, - long long int strideC, // NOLINT - int batchCount) { + static void GEMM_STRIDED_BATCH(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float16 *alpha, const float16 *A, + int lda, long long int strideA, // NOLINT + const float16 *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const float16 *beta, float16 *C, int ldc, + long long int strideC, // NOLINT + int batchCount) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( handle, transa, transb, m, n, k, @@ -114,6 +157,45 @@ struct CUBlas { ldc, strideC, batchCount)); #else PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const void *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const void *beta, void *C, + cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + PADDLE_ENFORCE(platform::dynload::cublasGemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); +#else + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -133,8 +215,21 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, N); +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, N); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, N); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 } template <> @@ -157,30 +252,18 @@ inline void Blas::GEMM( PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53, "cublas fp16 gemm requires GPU compute capability >= 53"); -#if CUDA_VERSION >= 8000 float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - if (context_.GetComputeCapability() >= 70) { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH)); - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } else { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_DEFAULT_MATH)); - } -#endif // CUDA_VERSION >= 9000 - +#if CUDA_VERSION >= 8000 // cublasHgemm does true FP16 computation which is slow for non-Volta // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: // input/output in fp16, computation in fp32, which can also be accelerated // using tensor cores in volta GPUs. - PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B, - CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, - CUDA_R_32F, algo)); + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A, + CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, @@ -199,8 +282,38 @@ void Blas::GEMM(bool transA, bool transB, int M, // the cblas convention. cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, ldc); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + +template <> +template <> +inline void Blas::GEMM( + bool transA, bool transB, int M, int N, int K, platform::float16 alpha, + const platform::float16 *A, int lda, const platform::float16 *B, int ldb, + platform::float16 beta, platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? 
CUBLAS_OP_T : CUBLAS_OP_N; + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, + ldc); } template <> @@ -238,9 +351,34 @@ void Blas::BatchedGEMM( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; - CUBlas::GEMM_BATCH(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, - strideC, batchCount); +#if CUDA_VERSION >= 9010 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto cublas_call = [&]() { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + + PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( + context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, + CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); + }; + auto &dev_ctx = const_cast(context_); + dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); + } else { +#endif // CUDA_VERSION >= 9010 + + CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, strideB, A, lda, + strideA, &beta, C, ldc, strideC, batchCount); + +#if CUDA_VERSION >= 9010 + } +#endif // CUDA_VERSION >= 9010 } } // namespace math diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 9a9018cdea..3edd727978 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -143,6 +143,39 @@ class CudnnWorkspaceHandle { std::unique_ptr> guard_; }; +#if CUDA_VERSION >= 9000 +class ScopedCublasMathMode { + public: + ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) + : handle_(handle) { + need_reset = false; + PADDLE_ENFORCE( + platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), + "Failed to get old cublas math mode"); + if (old_math_mode_ != new_math_mode) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, new_math_mode), + "Failed to set old cublas math mode"); + need_reset = true; + } + } + + ~ScopedCublasMathMode() { + if (need_reset) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, old_math_mode_), + "Failed to set old cublas math mode"); + } + } + + private: + cublasHandle_t handle_; + cublasMath_t old_math_mode_; + bool need_reset; +}; + +#endif + class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext { callback_manager_->Wait(); } +#if CUDA_VERSION >= 9000 + /*! \brief CublasCall may need to change cublas's config, + * but the cublas may be hold by multi-thread, so we should + * add lock here. 
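+   * The handle is switched to new_math only while the callback runs;
+   * ScopedCublasMathMode restores the previous math mode on scope exit.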
*/ + template + void CublasCall(Callback callback, cublasMath_t new_math) { + std::lock_guard guard(cublas_mtx_); + ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); + callback(); + } +#endif + private: CUDAPlace place_; @@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext { // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes mutable std::mutex callback_mtx_; std::unique_ptr callback_manager_; + + mutable std::mutex cublas_mtx_; }; template <> diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 4ea0cd7283..ff80bd525c 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -61,9 +61,6 @@ extern void *cublas_dso_handle; extern DynLoad__##__name __name #endif -#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ - DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) - #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasSaxpy_v2); \ __macro(cublasDaxpy_v2); \ @@ -93,22 +90,23 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // APIs available after CUDA 8.0 #if CUDA_VERSION >= 8000 -#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ - __macro(cublasGemmEx); \ - __macro(cublasSgemmStridedBatched); \ - __macro(cublasDgemmStridedBatched); \ - __macro(cublasCgemmStridedBatched); \ - __macro(cublasZgemmStridedBatched); \ - __macro(cublasHgemmStridedBatched); - -CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched); #endif // APIs available after CUDA 9.0 #if CUDA_VERSION >= 9000 -#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode); +#endif -CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +#if CUDA_VERSION >= 9010 +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx); #endif #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c78f159ad2..833d48347f 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -26,6 +26,16 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, "additional trunks of the same size will be requested from gpu " "until the gpu has no memory left for another trunk."); +DEFINE_bool( + enable_cublas_tensor_op_math, false, + "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " + "but it may loss precision. Currently, There are two CUDA libraries that" + " use Tensor Cores, cuBLAS and cuDNN. 
cuBLAS uses Tensor Cores to speed up"
+    " GEMM computations (the matrices must be either half precision or single "
+    "precision); cuDNN uses Tensor Cores to speed up both convolutions (the "
+    "input and output must be half precision) and recurrent neural networks "
+    "(RNNs).");
+
 namespace paddle {
 namespace platform {
 
@@ -64,6 +74,16 @@ int GetCUDADriverVersion(int id) {
   return driver_version;
 }
 
+bool TensorCoreAvailable() {
+#if CUDA_VERSION >= 9000
+  int device = GetCurrentDeviceId();
+  int compute_capability = GetCUDAComputeCapability(device);
+  return compute_capability >= 70;
+#else
+  return false;
+#endif
+}
+
 int GetCUDAMultiProcessors(int id) {
   PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must be less than GPU count");
   int count;
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index be44158431..6a0b3c8e02 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id);
 //! Get the driver version of the ith GPU
 int GetCUDADriverVersion(int id);
 
+//! Whether the current device supports Tensor Cores
+bool TensorCoreAvailable();
+
 //! Get the MultiProcessors of the ith GPU.
 int GetCUDAMultiProcessors(int i);
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 543acf2d34..3c092dee34 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -133,7 +133,8 @@ def __bootstrap__():
     if core.is_compiled_with_cuda():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
-            'conv_workspace_size_limit', 'cudnn_exhaustive_search'
+            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
+            'cudnn_exhaustive_search'
         ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])

From 6cc6bf4074d69c5c0b02af612b94e438d596803a Mon Sep 17 00:00:00 2001
From: Krzysztof Binias 
Date: Thu, 22 Nov 2018 15:30:43 +0100
Subject: [PATCH 12/36] Bumped MKL-DNN version to 0.17

test=develop
---
 cmake/external/mkldnn.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 785148d4f9..b280db23b9 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -53,7 +53,7 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
     GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "21fb5f2af1dd14e132af4f1b79160977ee487818"
+    GIT_TAG             "830a10059a018cd2634d94195140cf2d8790a75a"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

From a902b8b0f811f6837330385b95fa2f552393197c Mon Sep 17 00:00:00 2001
From: minqiyang 
Date: Fri, 23 Nov 2018 01:07:58 +0800
Subject: [PATCH 13/36] Add sqlite3 support

test=develop
---
 tools/manylinux1/Dockerfile.x64 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index e91216a5b8..48fd145e5f 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh && \
-    bash build_scripts/install_nccl2.sh && rm -r build_scripts
+    bash build_scripts/install_nccl2.sh && rm -rf build_scripts
 ENV SSL_CERT_FILE=/opt/_internal/certs.pem

From 9ea1ce63192fee1a211aa5dcc6fecf4758434451 Mon Sep 17 00:00:00 2001 
From: Xin Pan 
Date: Fri, 23 Nov 2018 11:51:07 +0800
Subject: [PATCH 14/36] Update issue templates

---
 .github/ISSUE_TEMPLATE/---feature-request-.md | 27 +++++++++++++
 .github/ISSUE_TEMPLATE/---inference-issue-.md | 40 +++++++++++++++++++
 .../ISSUE_TEMPLATE/---installation-issue-.md  | 40 +++++++++++++++++++
 .github/ISSUE_TEMPLATE/---model-issue-.md     | 36 +++++++++++++++++
 .github/ISSUE_TEMPLATE/---others-.md          | 33 +++++++++++++++
 .github/ISSUE_TEMPLATE/---training-issue-.md  | 38 ++++++++++++++++++
 6 files changed, 214 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/---feature-request-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---inference-issue-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---installation-issue-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---model-issue-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---others-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---training-issue-.md

diff --git a/.github/ISSUE_TEMPLATE/---feature-request-.md b/.github/ISSUE_TEMPLATE/---feature-request-.md
new file mode 100644
index 0000000000..57708855dc
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---feature-request-.md
@@ -0,0 +1,27 @@
+---
+name: 建议(Feature request)
+about: 您可以提出您的建议。 You could use this template for reporting a suggestion issue.
+
+---
+
+欢迎您对PaddlePaddle提出建议,非常感谢您对PaddlePaddle的贡献!
+在留下您的建议时,辛苦您同步提供如下信息:
+- 版本、环境信息
+1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1
+2)CPU/GPU:您是否使用GPU进行训练,如是,请提供您的CUDA和cuDNN版本号
+3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 建议描述:请您详细描述,您认为需优化的功能
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+Please make sure that this is a feature request.
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe the feature and the current behavior/state.**
+**Any Other info.**
diff --git a/.github/ISSUE_TEMPLATE/---inference-issue-.md b/.github/ISSUE_TEMPLATE/---inference-issue-.md
new file mode 100644
index 0000000000..37bdc8889e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---inference-issue-.md
@@ -0,0 +1,40 @@
+---
+name: 预测(Inference Issue)
+about: 您可以提问预测中报错、应用等问题。 You could use this template for reporting an inference issue.
+
+---
+
+为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
+- 标题:简洁、精准描述您的问题,例如“最新预测库的API文档在哪儿 ”
+- 版本、环境信息:
+    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID
+    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况
+    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号
+    4)系统环境:请您描述系统类型、版本(如Mac OS 10.14),Python版本
+- 预测信息
+    1)C++预测:请您提供预测库安装包的版本信息,及其中的version.txt文件
+    2)CMake包含路径的完整命令
+    3)API信息(如调用请提供)
+    4)预测库来源:官网下载/特殊环境(如BCLOUD编译)
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle. 
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+If there is no solution, please make sure that this is an inference issue including the following details:
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Python version
+-Cmake orders
+-C++version.txt
+-API information
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---installation-issue-.md b/.github/ISSUE_TEMPLATE/---installation-issue-.md
new file mode 100644
index 0000000000..ce4ba58932
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---installation-issue-.md
@@ -0,0 +1,40 @@
+---
+name: 安装(Installation Issue)
+about: 您可以提问安装、编译出现报错等问题。 You could use this template for reporting an installation
+  issue.
+
+---
+
+为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+建立issue时,为快速解决问题,请您根据使用情况给出如下信息:
+- 标题:请包含关键词“安装错误”/“编译错误”,例如“Mac编译错误”
+- 版本、环境信息:
+    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID
+    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况
+    3)GPU:请提供GPU型号,CUDA和CUDNN版本号
+    4)系统环境:请说明系统类型、版本(如Mac OS 10.14)、Python版本
+- 安装方式信息:
+1)pip安装/docker安装
+2)本地编译:请提供cmake命令,编译命令
+3)docker编译:请提供docker镜像,编译命令            
+  特殊环境请注明:如离线安装等
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in Github in case that there was a similar issue submitted or resolved before.
+If there is no solution, please make sure that this is an installation issue including the following details:
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg. Mac OS 10.14)
+-Python version
+- Install method: pip install/install with docker/build from source(without docker)/build within docker
+- Other special cases that you think may be related to this problem, eg. offline install, special internet condition   
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---model-issue-.md b/.github/ISSUE_TEMPLATE/---model-issue-.md
new file mode 100644
index 0000000000..7cb52f37b9
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---model-issue-.md
@@ -0,0 +1,36 @@
+---
+name: 模型(Model Issue)
+about: 您可以提问模型、算法、数据集方向的使用报错等问题。You could use this template for reporting a model/
+  algorithm/dataset issue.
+
+---
+
+为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+建立issue时,为快速解决问题,请您根据使用情况给出如下信息:
+- 标题:简洁、精准描述您的问题,例如“ssd 模型前置lstm报错  ”
+- 版本、环境信息:
+    1)PaddlePaddle版本:请提供PaddlePaddle版本号,例如1.1或CommitID
+    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况
+    3)GPU:请提供GPU型号,CUDA和CUDNN版本号
+    4)系统环境:请说明系统类型、版本(例如Mac OS 10.14),Python版本
+- 模型信息
+    1)模型名称 2)使用数据集名称 3)使用算法名称 4)模型链接
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github. Probably there was a similar issue submitted or resolved before. 
+If there is no solution, please make sure that this is an issue of models including the following details:
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Python version
+-Name of Models&Dataset/details of operator
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---others-.md b/.github/ISSUE_TEMPLATE/---others-.md
new file mode 100644
index 0000000000..6a291153e4
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---others-.md
@@ -0,0 +1,33 @@
+---
+name: 其他(Others)
+about: 如上述分类未包含您的问题,可在此提出。 You could use this template for reporting other issues
+
+---
+
+为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
+- 标题:简洁、精准概括您的问题
+- 版本、环境信息:
+    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID
+    2)CPU/GPU:如果您使用GPU训练,请提供GPU驱动版本、CUDA和cuDNN版本号
+    3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14
+    4)Python版本号
+    5)显存信息
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+If there is no solution, please provide us with the following details:
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/cuDNN version
+-OS Platform and Distribution (eg.Mac OS 10.14)
+-Python version
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---training-issue-.md b/.github/ISSUE_TEMPLATE/---training-issue-.md
new file mode 100644
index 0000000000..29e8383d97
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---training-issue-.md
@@ -0,0 +1,38 @@
+---
+name: 训练(Training issue)
+about: 您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting a training
+  issue.
+
+---
+
+为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
+- 标题:简洁、精准概括您的问题,例如“Insufficient Memory xxx”
+- 版本、环境信息:
+    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID
+    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况
+    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号
+    4)系统环境:请您描述系统类型、版本,例如Mac OS 10.14,Python版本
+- 训练信息
+    1)单机/多机,单卡/多卡
+    2)显存信息
+    3)Operator信息
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. 
+If there is no solution, please make sure that this is a training issue including the following details:
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Other information: Distributed training/information of operator/
+Graphics card storage
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**

From 36f08eef3b466001f339e2c33f47dac60bbc6821 Mon Sep 17 00:00:00 2001
From: qingqing01 
Date: Fri, 23 Nov 2018 13:04:41 +0800
Subject: [PATCH 15/36] CUDA kernel for density_prior_box_op. (#14513)

* CUDA kernel for density_prior_box_op.

* Support flatten to 2D.
---
 paddle/fluid/API.spec                         |   2 +-
 paddle/fluid/framework/op_desc.cc             |   6 +
 .../fluid/operators/detection/CMakeLists.txt  |   2 +-
 .../detection/density_prior_box_op.cc         |  36 ++--
 .../detection/density_prior_box_op.cu         | 170 ++++++++++++++++++
 .../detection/density_prior_box_op.h          |  73 ++++----
 python/paddle/fluid/layers/detection.py       |  43 +++--
 python/paddle/fluid/tests/test_detection.py   |  60 ++++---
 .../unittests/test_density_prior_box_op.py    |  30 ++--
 9 files changed, 305 insertions(+), 117 deletions(-)
 create mode 100644 paddle/fluid/operators/detection/density_prior_box_op.cu

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 541c4db1fa..50114bf3df 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
 paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
-paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None))
+paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None))
 paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
 paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
diff --git 
a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index fbaa169df6..362cda3f23 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { this->attrs_[name] = std::vector(); break; } + case proto::AttrType::LONGS: { + VLOG(110) << "SetAttr: " << Type() << ", " << name + << " from LONGS to LONGS"; + this->attrs_[name] = std::vector(); + break; + } case proto::AttrType::FLOATS: { VLOG(110) << "SetAttr: " << Type() << ", " << name << " from INTS to FLOATS"; diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 58f6f48467..6c85f1577e 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -22,7 +22,7 @@ iou_similarity_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) -detection_library(density_prior_box_op SRCS density_prior_box_op.cc) +detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(target_assign_op SRCS target_assign_op.cc diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc index 99df15c322..1012ba3652 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -39,24 +39,27 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { auto fixed_sizes = ctx->Attrs().Get>("fixed_sizes"); auto fixed_ratios = ctx->Attrs().Get>("fixed_ratios"); auto densities = ctx->Attrs().Get>("densities"); + bool flatten = ctx->Attrs().Get("flatten_to_2d"); PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(), "The number of fixed_sizes and densities must be equal."); size_t num_priors = 0; - if ((fixed_sizes.size() > 0) && (densities.size() > 0)) { - for (size_t i = 0; i < densities.size(); ++i) { - if (fixed_ratios.size() > 0) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - } + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + if (!flatten) { + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } else { + int64_t dim0 = input_dims[2] * input_dims[3] * num_priors; + ctx->SetOutputDim("Boxes", {dim0, 4}); + ctx->SetOutputDim("Variances", {dim0, 4}); } - std::vector dim_vec(4); - dim_vec[0] = input_dims[2]; - dim_vec[1] = input_dims[3]; - dim_vec[2] = num_priors; - dim_vec[3] = 4; - ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); - ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); } protected: @@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - platform::CPUPlace()); + ctx.GetPlace()); } }; @@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public 
framework::OpProtoAndCheckerMaker { }); AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") .SetDefault(true); - + AddAttr("flatten_to_2d", + "(bool) Whether to flatten to 2D and " + "the second dim is 4.") + .SetDefault(false); AddAttr( "step_w", "Density prior boxes step across width, 0.0 for auto calculation.") diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu new file mode 100644 index 0000000000..3b7c781795 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -0,0 +1,170 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" + +namespace paddle { +namespace operators { + +template +static __device__ inline T Clip(T in) { + return min(max(in, 0.), 1.); +} + +template +static __global__ void GenDensityPriorBox( + const int height, const int width, const int im_height, const int im_width, + const T offset, const T step_width, const T step_height, + const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin, + const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) { + int gidx = blockIdx.x * blockDim.x + threadIdx.x; + int gidy = blockIdx.y * blockDim.y + threadIdx.y; + int step_x = blockDim.x * gridDim.x; + int step_y = blockDim.y * gridDim.y; + + const T* width_ratio = ratios_shift; + const T* height_ratio = ratios_shift + num_priors; + const T* width_shift = ratios_shift + 2 * num_priors; + const T* height_shift = ratios_shift + 3 * num_priors; + + for (int j = gidy; j < height; j += step_y) { + for (int i = gidx; i < width * num_priors; i += step_x) { + int h = j; + int w = i / num_priors; + int k = i % num_priors; + + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + + T center_x_temp = center_x + width_shift[k]; + T center_y_temp = center_y + height_shift[k]; + + T box_width_ratio = width_ratio[k] / 2.; + T box_height_ratio = height_ratio[k] / 2.; + + T xmin = max((center_x_temp - box_width_ratio) / im_width, 0.); + T ymin = max((center_y_temp - box_height_ratio) / im_height, 0.); + T xmax = min((center_x_temp + box_width_ratio) / im_width, 1.); + T ymax = min((center_y_temp + box_height_ratio) / im_height, 1.); + + int out_offset = (j * width * num_priors + i) * 4; + out[out_offset] = is_clip ? Clip(xmin) : xmin; + out[out_offset + 1] = is_clip ? Clip(ymin) : ymin; + out[out_offset + 2] = is_clip ? Clip(xmax) : xmax; + out[out_offset + 3] = is_clip ? 
Clip(ymax) : ymax;
+
+      var[out_offset] = var_xmin;
+      var[out_offset + 1] = var_ymin;
+      var[out_offset + 2] = var_xmax;
+      var[out_offset + 3] = var_ymax;
+    }
+  }
+}
+
+template
+class DensityPriorBoxOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input("Input");
+    auto* image = ctx.Input("Image");
+    auto* boxes = ctx.Output("Boxes");
+    auto* vars = ctx.Output("Variances");
+
+    auto variances = ctx.Attr>("variances");
+    auto is_clip = ctx.Attr("clip");
+
+    auto fixed_sizes = ctx.Attr>("fixed_sizes");
+    auto fixed_ratios = ctx.Attr>("fixed_ratios");
+    auto densities = ctx.Attr>("densities");
+
+    T step_w = static_cast(ctx.Attr("step_w"));
+    T step_h = static_cast(ctx.Attr("step_h"));
+    T offset = static_cast(ctx.Attr("offset"));
+
+    auto img_width = image->dims()[3];
+    auto img_height = image->dims()[2];
+
+    auto feature_width = input->dims()[3];
+    auto feature_height = input->dims()[2];
+
+    T step_width, step_height;
+    if (step_w == 0 || step_h == 0) {
+      step_width = static_cast(img_width) / feature_width;
+      step_height = static_cast(img_height) / feature_height;
+    } else {
+      step_width = step_w;
+      step_height = step_h;
+    }
+
+    int num_priors = 0;
+    for (size_t i = 0; i < densities.size(); ++i) {
+      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
+    }
+    int step_average = static_cast((step_width + step_height) * 0.5);
+
+    framework::Tensor h_temp;
+    T* tdata = h_temp.mutable_data({num_priors * 4}, platform::CPUPlace());
+    int idx = 0;
+    for (size_t s = 0; s < fixed_sizes.size(); ++s) {
+      auto fixed_size = fixed_sizes[s];
+      int density = densities[s];
+      for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+        float ar = fixed_ratios[r];
+        int shift = step_average / density;
+        float box_width_ratio = fixed_size * sqrt(ar);
+        float box_height_ratio = fixed_size / sqrt(ar);
+        for (int di = 0; di < density; ++di) {
+          for (int dj = 0; dj < density; ++dj) {
+            float center_x_temp = shift / 2. + dj * shift - step_average / 2.;
+            float center_y_temp = shift / 2. + di * shift - step_average / 2.;
+            tdata[idx] = box_width_ratio;
+            tdata[num_priors + idx] = box_height_ratio;
+            tdata[2 * num_priors + idx] = center_x_temp;
+            tdata[3 * num_priors + idx] = center_y_temp;
+            idx++;
+          }
+        }
+      }
+    }
+
+    boxes->mutable_data(ctx.GetPlace());
+    vars->mutable_data(ctx.GetPlace());
+
+    framework::Tensor d_temp;
+    framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);
+
+    // At least use 32 threads, at most 512 threads.
+    // blockx is a multiple of 32. 
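+    // Worked example (editor's note; figures borrowed from the unit test
+    // later in this patch: feature_width = 17, num_priors = 21):
+    // 17 * 21 = 357, ((357 + 31) >> 5) << 5 = 384, so blockx =
+    // min(384, 512) = 384 and gridx = (357 + 383) / 384 = 1.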
+    int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L);
+    int gridx = (feature_width * num_priors + blockx - 1) / blockx;
+    dim3 threads(blockx, 1);
+    dim3 grids(gridx, feature_height);
+
+    auto stream =
+        ctx.template device_context().stream();
+    GenDensityPriorBox<<>>(
+        feature_height, feature_width, img_height, img_width, offset,
+        step_width, step_height, num_priors, d_temp.data(), is_clip,
+        variances[0], variances[1], variances[2], variances[3],
+        boxes->data(), vars->data());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(density_prior_box,
+                        ops::DensityPriorBoxOpCUDAKernel,
+                        ops::DensityPriorBoxOpCUDAKernel);
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h
index 9a52077e9c..ed2f5df80c 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel {
       step_height = step_h;
     }
     int num_priors = 0;
-    if (fixed_sizes.size() > 0 && densities.size() > 0) {
-      for (size_t i = 0; i < densities.size(); ++i) {
-        if (fixed_ratios.size() > 0) {
-          num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
-        }
-      }
+    for (size_t i = 0; i < densities.size(); ++i) {
+      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
     }
 
     boxes->mutable_data(ctx.GetPlace());
     vars->mutable_data(ctx.GetPlace());
 
-    auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0);
+    auto box_dim = vars->dims();
+    boxes->Resize({feature_height, feature_width, num_priors, 4});
+    auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0);
 
     int step_average = static_cast((step_width + step_height) * 0.5);
 
     for (int h = 0; h < feature_height; ++h) {
@@ -76,36 +74,34 @@ class DensityPriorBoxOpKernel : public framework::OpKernel {
           auto fixed_size = fixed_sizes[s];
           int density = densities[s];
           // Generate density prior boxes with fixed ratios.
-          if (fixed_ratios.size() > 0) {
-            for (size_t r = 0; r < fixed_ratios.size(); ++r) {
-              float ar = fixed_ratios[r];
-              int shift = step_average / density;
-              float box_width_ratio = fixed_size * sqrt(ar);
-              float box_height_ratio = fixed_size / sqrt(ar);
-              for (int di = 0; di < density; ++di) {
-                for (int dj = 0; dj < density; ++dj) {
-                  float center_x_temp =
-                      center_x - step_average / 2. + shift / 2. + dj * shift;
-                  float center_y_temp =
-                      center_y - step_average / 2. + shift / 2. + di * shift;
-                  e_boxes(h, w, idx, 0) =
-                      (center_x_temp - box_width_ratio / 2.) / img_width >= 0
-                          ? (center_x_temp - box_width_ratio / 2.) / img_width
-                          : 0;
-                  e_boxes(h, w, idx, 1) =
-                      (center_y_temp - box_height_ratio / 2.) / img_height >= 0
-                          ? (center_y_temp - box_height_ratio / 2.) / img_height
-                          : 0;
-                  e_boxes(h, w, idx, 2) =
-                      (center_x_temp + box_width_ratio / 2.) / img_width <= 1
-                          ? (center_x_temp + box_width_ratio / 2.) / img_width
-                          : 1;
-                  e_boxes(h, w, idx, 3) =
-                      (center_y_temp + box_height_ratio / 2.) / img_height <= 1
-                          ? (center_y_temp + box_height_ratio / 2.) / img_height
-                          : 1;
-                  idx++;
-                }
+          for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+            float ar = fixed_ratios[r];
+            int shift = step_average / density;
+            float box_width_ratio = fixed_size * sqrt(ar);
+            float box_height_ratio = fixed_size / sqrt(ar);
+            for (int di = 0; di < density; ++di) {
+              for (int dj = 0; dj < density; ++dj) {
+                float center_x_temp =
+                    center_x - step_average / 2. + shift / 2. + dj * shift;
+                float center_y_temp =
+                    center_y - step_average / 2. + shift / 2. + di * shift;
+                e_boxes(h, w, idx, 0) =
+                    (center_x_temp - box_width_ratio / 2.) / img_width >= 0
+                        ? (center_x_temp - box_width_ratio / 2.) / img_width
+                        : 0;
+                e_boxes(h, w, idx, 1) =
+                    (center_y_temp - box_height_ratio / 2.) / img_height >= 0
+                        ? (center_y_temp - box_height_ratio / 2.) / img_height
+                        : 0;
+                e_boxes(h, w, idx, 2) =
+                    (center_x_temp + box_width_ratio / 2.) / img_width <= 1
+                        ? (center_x_temp + box_width_ratio / 2.) / img_width
+                        : 1;
+                e_boxes(h, w, idx, 3) =
+                    (center_y_temp + box_height_ratio / 2.) 
/ img_height
+                        ? (center_y_temp + box_height_ratio / 2.) / img_height
+                        : 1;
+                idx++;
               }
             }
           }
@@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel {
 
     e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1));
     vars->Resize(var_dim);
+    boxes->Resize(box_dim);
   }
 };  // namespace operators
 
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3f17400a14..4843af8340 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -1029,6 +1029,7 @@ def density_prior_box(input,
                       clip=False,
                       steps=[0.0, 0.0],
                       offset=0.5,
+                      flatten_to_2d=False,
                       name=None):
     """
     **Density Prior Box Operator**
@@ -1065,22 +1066,24 @@ def density_prior_box(input,
             height/width of the input will be automatically calculated.
             Default: [0., 0.]
         offset(float): Prior boxes center offset. Default: 0.5
+        flatten_to_2d(bool): Whether to flatten output prior boxes and variance
+           to 2D shape, where the second dim is 4. Default: False.
        name(str): Name of the density prior box op. Default: None.
 
     Returns:
         tuple: A tuple with two Variable (boxes, variances)
 
         boxes: the output density prior boxes of PriorBox.
-            The layout is [H, W, num_priors, 4].
-            H is the height of input, W is the width of input,
-            num_priors is the total
-            box count of each position of input.
+            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
+            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
+            H is the height of input, W is the width of input,
+            num_priors is the total box count of each position of input.
 
         variances: the expanded variances of PriorBox.
-            The layout is [H, W, num_priors, 4].
-            H is the height of input, W is the width of input
-            num_priors is the total
-            box count of each position of input
+            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
+            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
+            H is the height of input, W is the width of input,
+            num_priors is the total box count of each position of input. 
Examples: @@ -1089,14 +1092,11 @@ def density_prior_box(input, box, var = fluid.layers.density_prior_box( input=conv1, image=images, - min_sizes=[100.], - max_sizes=[200.], - aspect_ratios=[1.0, 1.0 / 2.0, 2.0], - densities=[3, 4], - fixed_sizes=[50., 60.], - fixed_ratios=[1.0, 3.0, 1.0 / 3.0], - flip=True, - clip=True) + densities=[4, 2, 1], + fixed_sizes=[32.0, 64.0, 128.0], + fixed_ratios=[1.], + clip=True, + flatten_to_2d=True) """ helper = LayerHelper("density_prior_box", **locals()) dtype = helper.input_dtype() @@ -1127,14 +1127,11 @@ def density_prior_box(input, 'step_w': steps[0], 'step_h': steps[1], 'offset': offset, + 'densities': densities, + 'fixed_sizes': fixed_sizes, + 'fixed_ratios': fixed_ratios, + 'flatten_to_2d': flatten_to_2d, } - if densities is not None and len(densities) > 0: - attrs['densities'] = densities - if fixed_sizes is not None and len(fixed_sizes) > 0: - attrs['fixed_sizes'] = fixed_sizes - if fixed_ratios is not None and len(fixed_ratios) > 0: - attrs['fixed_ratios'] = fixed_ratios - box = helper.create_variable_for_type_inference(dtype) var = helper.create_variable_for_type_inference(dtype) helper.append_op( diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 982d291801..a2eca5541a 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -112,38 +112,42 @@ class TestDetection(unittest.TestCase): class TestPriorBox(unittest.TestCase): def test_prior_box(self): - data_shape = [3, 224, 224] - images = fluid.layers.data( - name='pixel', shape=data_shape, dtype='float32') - conv1 = fluid.layers.conv2d(images, 3, 3, 2) - box, var = layers.prior_box( - input=conv1, - image=images, - min_sizes=[100.0], - aspect_ratios=[1.], - flip=True, - clip=True) - assert len(box.shape) == 4 - assert box.shape == var.shape - assert box.shape[3] == 4 + program = Program() + with program_guard(program): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.prior_box( + input=conv1, + image=images, + min_sizes=[100.0], + aspect_ratios=[1.], + flip=True, + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[3] == 4 class TestDensityPriorBox(unittest.TestCase): def test_density_prior_box(self): - data_shape = [3, 224, 224] - images = fluid.layers.data( - name='pixel', shape=data_shape, dtype='float32') - conv1 = fluid.layers.conv2d(images, 3, 3, 2) - box, var = layers.density_prior_box( - input=conv1, - image=images, - densities=[3, 4], - fixed_sizes=[50., 60.], - fixed_ratios=[1.0], - clip=True) - assert len(box.shape) == 4 - assert box.shape == var.shape - assert box.shape[3] == 4 + program = Program() + with program_guard(program): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.density_prior_box( + input=conv1, + image=images, + densities=[3, 4], + fixed_sizes=[50., 60.], + fixed_ratios=[1.0], + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[-1] == 4 class TestAnchorGenerator(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py index 79d1fd3d71..4b0bc1dcf8 100644 --- a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py 
+++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
@@ -36,7 +36,8 @@ class TestDensityPriorBoxOp(OpTest):
             'offset': self.offset,
             'densities': self.densities,
             'fixed_sizes': self.fixed_sizes,
-            'fixed_ratios': self.fixed_ratios
+            'fixed_ratios': self.fixed_ratios,
+            'flatten_to_2d': self.flatten_to_2d
         }
         self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
 
@@ -48,16 +49,17 @@ class TestDensityPriorBoxOp(OpTest):
         self.set_data()
 
     def set_density(self):
-        self.densities = []
-        self.fixed_sizes = []
-        self.fixed_ratios = []
+        self.densities = [4, 2, 1]
+        self.fixed_sizes = [32.0, 64.0, 128.0]
+        self.fixed_ratios = [1.0]
+        self.layer_w = 17
+        self.layer_h = 17
+        self.image_w = 533
+        self.image_h = 533
+        self.flatten_to_2d = False
 
     def init_test_params(self):
-        self.layer_w = 32
-        self.layer_h = 32
-
-        self.image_w = 40
-        self.image_h = 40
+        self.set_density()
 
         self.step_w = float(self.image_w) / float(self.layer_w)
         self.step_h = float(self.image_h) / float(self.layer_h)
@@ -69,8 +71,6 @@ class TestDensityPriorBoxOp(OpTest):
         self.variances = [0.1, 0.1, 0.2, 0.2]
         self.variances = np.array(self.variances, dtype=np.float).flatten()
 
-        self.set_density()
-
         self.clip = True
         self.num_priors = 0
         if len(self.fixed_sizes) > 0 and len(self.densities) > 0:
@@ -129,6 +129,9 @@ class TestDensityPriorBoxOp(OpTest):
             (self.layer_h, self.layer_w, self.num_priors, 1))
         self.out_boxes = out_boxes.astype('float32')
         self.out_var = out_var.astype('float32')
+        if self.flatten_to_2d:
+            self.out_boxes = self.out_boxes.reshape((-1, 4))
+            self.out_var = self.out_var.reshape((-1, 4))
 
 
 class TestDensityPriorBox(TestDensityPriorBoxOp):
@@ -136,6 +139,11 @@ class TestDensityPriorBox(TestDensityPriorBoxOp):
         self.densities = [3, 4]
         self.fixed_sizes = [1.0, 2.0]
         self.fixed_ratios = [1.0]
+        self.layer_w = 32
+        self.layer_h = 32
+        self.image_w = 40
+        self.image_h = 40
+        self.flatten_to_2d = True
 
 
 if __name__ == '__main__':

From 61c5f13fcf92c18f30c05a90e3d3badd884f9340 Mon Sep 17 00:00:00 2001
From: sabreshao 
Date: Fri, 23 Nov 2018 14:27:39 +0800
Subject: [PATCH 16/36] Fix cmake for AMDGPU platform (#13801)

* HIP cmake.
Enable whole archive build for pybind library.
Disable two warnings.
Rollback to C++11.
Link RCCL to work around a gpu kernel loading issue.
Update eigen to fix build failure.
Add more include directories.
Fix O3 build failure.
Update eigen.
fix tensor_util_test segmentation fault issue
add more macro checks in hip.cmake.
we may consider refining hip.cmake to inherit all add_definitions() in parent scope, in the future.
Fix rocRAND load.
Update eigen to fix gru_unit_op and reduce_op.
Add HIP support to testing.
Update eigen to support int16 and int8 in arg min and arg max.

* add rocprim as cub library used by nv implementation

* Reduce build time in rocprim.

* Add rocprim introduction, remove useless cmake code.

* Remove useless flags and format cmake file. 
--- CMakeLists.txt | 1 + cmake/external/eigen.cmake | 2 +- cmake/external/rocprim.cmake | 44 +++++++++++++++++++++++++++++ cmake/flags.cmake | 3 ++ cmake/generic.cmake | 26 +++++++++-------- cmake/hip.cmake | 32 +++++++++++++++++---- paddle/fluid/pybind/CMakeLists.txt | 4 +-- paddle/testing/paddle_gtest_main.cc | 2 +- 8 files changed, 94 insertions(+), 20 deletions(-) create mode 100644 cmake/external/rocprim.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 3059ab7e0e..8dcf9786e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,6 +204,7 @@ include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) +include(external/rocprim) include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 573ad5e5f0..6aef97f212 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -17,7 +17,7 @@ if(WITH_AMD_GPU) extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 + GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake new file mode 100644 index 0000000000..914c064918 --- /dev/null +++ b/cmake/external/rocprim.cmake @@ -0,0 +1,44 @@ +if (NOT WITH_AMD_GPU) + return() +endif() + +# rocprim is "ROCm Parallel Primitives" for short. +# It is a header-only library providing HIP and HC parallel primitives +# for developing performant GPU-accelerated code on AMD ROCm platform. + +if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "/opt/rocm/hcc") +endif() + +INCLUDE(ExternalProject) + +SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim) +SET(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim) +SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include) + +ExternalProject_Add( + extern_rocprim + GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git" + GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc + PREFIX ${ROCPRIM_SOURCE_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc + CMAKE_ARGS -DONLY_INSTALL=ON + CMAKE_ARGS -DBUILD_TEST=OFF + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR} + + INSTALL_DIR ${ROCPRIM_INSTALL_DIR} + ${EXTERNAL_PROJECT_LOG_ARGS} +) + +INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR}) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";") + add_library(rocprim STATIC ${dummyfile}) +else() + add_library(rocprim INTERFACE) +endif() + +add_dependencies(rocprim extern_rocprim) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 343e44ab4b..c4472040ce 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -129,6 +129,9 @@ set(COMMON_FLAGS -Wno-error=parentheses-equality # Warnings in pybind11 -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 -Wno-error=terminate # Warning in PADDLE_ENFORCE + -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 + -Wimplicit-fallthrough=0 # Warning in tinyformat.h + -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2 ) set(GPU_COMMON_FLAGS diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 111627a932..7d803d00ef 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -454,25 +454,29 @@ 
function(hip_library TARGET_NAME)
     else()
       add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
       set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
-      target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
-      find_fluid_modules(${TARGET_NAME})
+      target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so)
+      find_fluid_modules(${TARGET_NAME})
     endif()
-    if (hip_library_DEPS)
-      add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
-      target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+    if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
+      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+      # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
+      target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END)
+    else()
+      target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
     endif()
     # cpplint code style
     foreach(source_file ${hip_library_SRCS})
-      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-        list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-      endif()
+      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+      endif()
     endforeach()
   else(hip_library_SRCS)
     if (hip_library_DEPS)
-      merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+      merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
     else()
-      message(FATAL "Please specify source file or library in nv_library.")
+      message(FATAL_ERROR "Please specify source file or library in hip_library.")
     endif()
   endif(hip_library_SRCS)
   endif()
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index bfe491bd6b..4276bc5b08 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU)
 endif()
 
 include_directories("/opt/rocm/include")
+include_directories("/opt/rocm/hip/include")
+include_directories("/opt/rocm/miopen/include")
 include_directories("/opt/rocm/hipblas/include")
 include_directories("/opt/rocm/hiprand/include")
 include_directories("/opt/rocm/rocrand/include")
@@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust")
 
 list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
 
-set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
 
 if(WITH_DSO)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
 endif(WITH_DSO)
 
-if(WITH_DOUBLE)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
-endif(WITH_DOUBLE)
-
 if(WITH_TESTING)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
 endif(WITH_TESTING)
 
+if(WITH_DISTRIBUTE)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE")
+endif(WITH_DISTRIBUTE)
+
+if(WITH_GRPC)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
+endif(WITH_GRPC)
+
+if(NOT WITH_GOLANG)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
+endif(NOT WITH_GOLANG)
+
+if(WITH_MKLDNN)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
+endif(WITH_MKLDNN)
+
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
+
+if(NOT WITH_RDMA)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
+endif(NOT WITH_RDMA)
+
+
+ 
if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index fb6ee2f4a5..25d241d976 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -5,8 +5,8 @@ if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} - ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + DEPS ARCHIVE_START ${PYBIND_DEPS} + ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END) else() cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 598f435461..babb862122 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -28,7 +28,7 @@ int main(int argc, char** argv) { for (int i = 0; i < argc; ++i) { new_argv.push_back(argv[i]); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) new_argv.push_back( strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy")); #else From e21edb26f6e7fb364597c31a26f128c3c2710516 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 22 Nov 2018 17:53:13 +0800 Subject: [PATCH 17/36] add Set/GetCPUNumThreads api --- paddle/fluid/inference/api/analysis_config.cc | 1 + paddle/fluid/inference/api/analysis_predictor.cc | 3 +-- paddle/fluid/inference/api/api_impl.cc | 3 +-- paddle/fluid/inference/api/paddle_api.h | 9 +++++++++ .../inference/tests/api/analyzer_resnet50_tester.cc | 1 + paddle/fluid/inference/tests/api/config_printer.h | 2 ++ paddle/fluid/inference/tests/api/tester_helper.h | 1 + paddle/fluid/operators/math/fc_compute.h | 4 +--- paddle/fluid/platform/cpu_helper.cc | 2 +- 9 files changed, 18 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 5ccd2dc5ab..100ee0c9d3 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; + cpu_num_threads_ = other.cpu_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cb14d2a260..9162ccefd8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -35,7 +35,6 @@ #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); -DECLARE_int32(paddle_num_threads); namespace paddle { @@ -67,7 +66,7 @@ bool AnalysisPredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); if (!PrepareScope(parent_scope)) { return false; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index fcbc3803d0..c3d17edea4 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -28,7 +28,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" DEFINE_bool(profile, false, "Turn on profiler for fluid"); -DECLARE_int32(paddle_num_threads); namespace paddle { namespace { @@ -76,7 +75,7 @@ bool NativePaddlePredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 0a2a2a1a23..b7f7781d06 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -186,6 +186,15 @@ struct NativeConfig : public PaddlePredictor::Config { // Specify the variable's name of each input if input tensors don't follow the // `feeds` and `fetches` of the phase `save_inference_model`. bool specify_input_name{false}; + + // Set and get the number of cpu threads. + void SetCPUNumThreads(int cpu_num_threads) { + cpu_num_threads_ = cpu_num_threads; + } + int GetCPUNumThreads() const { return cpu_num_threads_; } + + protected: + int cpu_num_threads_{1}; // number of cpu threads for each instance. }; // A factory to help create different predictors. diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 2b936175ed..308a794ca3 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->enable_ir_optim = true; cfg->specify_input_name = true; + cfg->SetCPUNumThreads(FLAGS_paddle_num_threads); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index aa0c4b1d04..a803f5b3f4 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; + os << GenSpaces(num_spaces) + << "cpu_num_threads: " << config.GetCPUNumThreads() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 7b686045a5..fdadd59049 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); DECLARE_bool(profile); +DECLARE_int32(paddle_num_threads); namespace paddle { namespace inference { diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index b072b4c20a..5b9953a5aa 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -17,8 +17,6 @@ limitations under the License. 
*/
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/jit_kernel.h"
 
-DECLARE_int32(paddle_num_threads);
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -43,7 +41,7 @@ inline void FCCompute(const BlasT& blas, const int M,
           .template Get>(N);
 
 #ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
+#pragma omp parallel for
 #endif
     for (int i = 0; i < M; i++) {
       T* dst = Y + i * N;
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index f2d691b293..b737a6c38d 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) {
 #elif defined(PADDLE_WITH_MKLML)
   int real_num_threads = num_threads > 1 ? num_threads : 1;
   platform::dynload::MKL_Set_Num_Threads(real_num_threads);
-  omp_set_num_threads(num_threads);
+  omp_set_num_threads(real_num_threads);
 #else
   PADDLE_ENFORCE(false, "To be implemented.");
 #endif

From a5c4b463c962bed48fba89d459adf82f4899d6c3 Mon Sep 17 00:00:00 2001
From: luotao1 
Date: Thu, 22 Nov 2018 18:37:33 +0800
Subject: [PATCH 18/36] add SetMKLDNNThreadId api

---
 paddle/fluid/inference/api/analysis_predictor.cc    | 8 ++++++++
 paddle/fluid/inference/api/analysis_predictor.h     | 2 ++
 paddle/fluid/inference/api/paddle_analysis_config.h | 2 +-
 paddle/fluid/inference/tests/api/tester_helper.h    | 9 ++++++---
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 9162ccefd8..4633a75e5e 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -159,6 +159,14 @@ bool AnalysisPredictor::PrepareExecutor() {
   return true;
 }
 
+void AnalysisPredictor::SetMKLDNNThreadId(int tid) {
+#ifdef PADDLE_WITH_MKLDNN
+  platform::set_cur_thread_id(tid);
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
+#endif
+}
+
 bool AnalysisPredictor::Run(const std::vector &inputs,
                             std::vector *output_data,
                             int batch_size) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index cf81b7db73..9191970a3a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor {
   framework::Scope *scope() { return scope_.get(); }
   framework::ProgramDesc &program() { return *inference_program_; }
 
+  void SetMKLDNNThreadId(int tid);
+
 protected:
   bool PrepareProgram(const std::shared_ptr &program);
   bool PrepareScope(const std::shared_ptr &parent_scope);
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2ac736df7c..a09bd1cac2 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig {
                      int max_batch_size = 1);
   bool use_tensorrt() const { return use_tensorrt_; }
 
+  void EnableMKLDNN();
   // NOTE this is just for internal development, please do not use it.
   // NOT stable yet. 
-  void EnableMKLDNN();
   bool use_mkldnn() const { return use_mkldnn_; }
 
   friend class ::paddle::AnalysisPredictor;
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index fdadd59049..72703bc80b 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -216,13 +216,16 @@ void TestMultiThreadPrediction(
   size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
-#ifdef PADDLE_WITH_MKLDNN
-      platform::set_cur_thread_id(static_cast(tid) + 1);
-#endif
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector outputs_tid;
       auto &predictor = predictors[tid];
+#ifdef PADDLE_WITH_MKLDNN
+      if (use_analysis) {
+        static_cast(predictor.get())
+            ->SetMKLDNNThreadId(static_cast(tid) + 1);
+      }
+#endif
 
       // warmup run
       LOG(INFO) << "Running thread " << tid << ", warm up run...";

From e66b4c6bff74231898cbbb013627b0eb86eced0f Mon Sep 17 00:00:00 2001
From: luotao1 
Date: Thu, 22 Nov 2018 18:49:59 +0800
Subject: [PATCH 19/36] adjust tester_helper to make multi-instance multi-thread
 work

test=develop
---
 paddle/fluid/inference/tests/api/tester_helper.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 72703bc80b..d21567ac19 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -207,11 +207,7 @@ void TestMultiThreadPrediction(
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   std::vector threads;
-  std::vector> predictors;
-  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
-  for (int tid = 1; tid < num_threads; ++tid) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
+  auto main_predictor = CreateTestPredictor(config, use_analysis);
 
   size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
@@ -219,7 +215,9 @@
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
      std::vector outputs_tid;
-      auto &predictor = predictors[tid];
+      // To ensure that threads bind correctly,
+      // clone the predictor inside the thread pool. 
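+      // (Editor's note, illustrative rationale: cloning per thread gives
+      // each thread a private predictor, so the per-thread MKL-DNN id set
+      // via SetMKLDNNThreadId below stays isolated per instance.)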
+ auto predictor = main_predictor->Clone(); #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { static_cast(predictor.get()) From 116979a40adf7fe7788f8cd50b9f03c57bcbba7b Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 23 Nov 2018 16:17:56 +0800 Subject: [PATCH 20/36] refine api name test=develop --- paddle/fluid/inference/api/analysis_config.cc | 3 ++- paddle/fluid/inference/api/analysis_predictor.cc | 4 ++-- paddle/fluid/inference/api/analysis_predictor.h | 2 +- paddle/fluid/inference/api/api_impl.cc | 2 +- paddle/fluid/inference/api/paddle_api.h | 14 +++++++++----- .../tests/api/analyzer_resnet50_tester.cc | 2 +- paddle/fluid/inference/tests/api/config_printer.h | 2 +- paddle/fluid/inference/tests/api/tester_helper.h | 2 +- 8 files changed, 18 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 100ee0c9d3..dd75f0d9a6 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -46,7 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; - cpu_num_threads_ = other.cpu_num_threads_; + cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; @@ -73,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; + cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4633a75e5e..c132ce326c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -66,7 +66,7 @@ bool AnalysisPredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (!PrepareScope(parent_scope)) { return false; @@ -159,7 +159,7 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } -void AnalysisPredictor::SetMKLDNNThreadId(int tid) { +void AnalysisPredictor::SetMkldnnThreadID(int tid) { #ifdef PADDLE_WITH_MKLDNN platform::set_cur_thread_id(tid); #else diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 9191970a3a..db57812bc3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -69,7 +69,7 @@ class AnalysisPredictor : public PaddlePredictor { framework::Scope *scope() { return scope_.get(); } framework::ProgramDesc &program() { return *inference_program_; } - void SetMKLDNNThreadId(int tid); + void SetMkldnnThreadID(int tid); protected: bool PrepareProgram(const std::shared_ptr &program); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index c3d17edea4..66a8e51396 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -75,7 +75,7 @@ bool NativePaddlePredictor::Init( #endif // no matter with or without MKLDNN - 
paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index b7f7781d06..1513a4b3b4 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -187,14 +187,18 @@ struct NativeConfig : public PaddlePredictor::Config { // `feeds` and `fetches` of the phase `save_inference_model`. bool specify_input_name{false}; - // Set and get the number of cpu threads. - void SetCPUNumThreads(int cpu_num_threads) { - cpu_num_threads_ = cpu_num_threads; + // Set and get the number of cpu math library threads. + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; + } + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; } - int GetCPUNumThreads() const { return cpu_num_threads_; } protected: - int cpu_num_threads_{1}; // number of cpu threads for each instance. + // number of cpu math library (such as MKL, OpenBlas) threads for each + // instance. + int cpu_math_library_num_threads_{1}; }; // A factory to help create different predictors. diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 308a794ca3..abc63577b7 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -27,7 +27,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->enable_ir_optim = true; cfg->specify_input_name = true; - cfg->SetCPUNumThreads(FLAGS_paddle_num_threads); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index a803f5b3f4..4231eef722 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -54,7 +54,7 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; os << GenSpaces(num_spaces) - << "cpu_num_threads: " << config.GetCPUNumThreads() << "\n"; + << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index d21567ac19..1dc1678406 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -221,7 +221,7 @@ void TestMultiThreadPrediction( #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { static_cast(predictor.get()) - ->SetMKLDNNThreadId(static_cast(tid) + 1); + ->SetMkldnnThreadID(static_cast(tid) + 1); } #endif From c35bf3d34b43dd6cc5b96e963f8990d60a68d749 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 16:36:54 +0800 Subject: [PATCH 21/36] Fix multiclass_nms_op unit test fail in python3.6 test=develop --- .../fluid/tests/unittests/test_multiclass_nms_op.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py 
b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index df0562dcc7..e35be54b63 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -145,10 +145,16 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, lod.append(nmsed_num) if nmsed_num == 0: continue + tmp_det_out = [] for c, indices in nmsed_outs.items(): for idx in indices: xmin, ymin, xmax, ymax = boxes[n][idx][:] det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) + tmp_det_out.append( + [c, scores[n][c][idx], xmin, ymin, xmax, ymax]) + sorted_det_out = sorted( + tmp_det_out, key=lambda tup: tup[0], reverse=False) + det_outs.extend(sorted_det_out) return det_outs, lod @@ -210,7 +216,7 @@ class TestMulticlassNMSOp(OpTest): class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp): def set_argument(self): # Here set 2.0 to test the case there is no outputs. - # In practical use, 0.0 < score_threshold < 1.0 + # In practical use, 0.0 < score_threshold < 1.0 self.score_threshold = 2.0 From 5ca56cad1f3fdb50f7d019ae5e658b538f98aecc Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 23 Nov 2018 08:54:09 +0000 Subject: [PATCH 22/36] test=develop --- python/paddle/fluid/layers/control_flow.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 9730fbf510..05138bf945 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -896,9 +896,10 @@ def array_to_lod_tensor(x, table): def increment(x, value=1.0, in_place=True): """ - This function performs an operation that increments each value in the + This function performs an operation that increments the value in the input :math:`x` by an amount: :math:`value` as mentioned in the input - parameter. This operation is performed in-place by default. + parameter. This operation is performed in-place by default. Notice that + the number of elements in :math:`x` must be equal to 1. Args: x (Variable|list): The tensor that has the input values. @@ -911,7 +912,8 @@ def increment(x, value=1.0, in_place=True): Examples: .. 
code-block:: python - data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32') + data = fluid.layers.data(name='data', shape=[1], dtype='float32', + append_batch_size=False) data = fluid.layers.increment(x=data, value=3.0, in_place=True) """ helper = LayerHelper("increment", **locals()) From f7847ca6a304a649982c04bff4f3eec846a06c5d Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 23 Nov 2018 17:09:14 +0800 Subject: [PATCH 23/36] fix cublas warp error test=develop --- paddle/fluid/platform/dynload/cublas.cc | 3 +++ paddle/fluid/platform/dynload/cublas.h | 30 ++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc index 361d3439b8..41648c32fe 100644 --- a/paddle/fluid/platform/dynload/cublas.cc +++ b/paddle/fluid/platform/dynload/cublas.cc @@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); #endif +#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4 +CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); +#endif } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index ff80bd525c..ced789b90d 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -90,23 +90,33 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // APIs available after CUDA 8.0 #if CUDA_VERSION >= 8000 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched); +#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ + __macro(cublasGemmEx); \ + __macro(cublasSgemmStridedBatched); \ + __macro(cublasDgemmStridedBatched); \ + __macro(cublasCgemmStridedBatched); \ + __macro(cublasZgemmStridedBatched); \ + __macro(cublasHgemmStridedBatched); + +CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif // APIs available after CUDA 9.0 #if CUDA_VERSION >= 9000 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode); +#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ + __macro(cublasSetMathMode); \ + __macro(cublasGetMathMode); + +CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif +// APIs available after CUDA 9.1 #if CUDA_VERSION >= 9010 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx); +#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ + __macro(cublasGemmBatchedEx); \ + __macro(cublasGemmStridedBatchedEx); + +CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP From 5431d5c471d32a5ea3be049a339e57262bd3b483 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 18:15:06 +0800 Subject: [PATCH 24/36] Polish code test=develop --- python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index e35be54b63..9778bd694d 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ 
-149,7 +149,6 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, for c, indices in nmsed_outs.items(): for idx in indices: xmin, ymin, xmax, ymax = boxes[n][idx][:] - det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) tmp_det_out.append( [c, scores[n][c][idx], xmin, ymin, xmax, ymax]) sorted_det_out = sorted( From 3d100b0c927b6326e75b3e493d545ee2b0ff4f4b Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 19:14:46 +0800 Subject: [PATCH 25/36] Add Python3.6 Python3.7 compile process test=develop --- Dockerfile | 75 +++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/Dockerfile b/Dockerfile index b36102175c..eb7bb2549e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,44 +33,49 @@ RUN apt-get update && \ automake locales clang-format swig cmake \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ + build-essential checkinstall \ + libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools libtool ccache && \ apt-get clean -y -# Install Go and glide -RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ - tar -xz -C /usr/local && \ - mkdir /root/gopath && \ - mkdir /root/gopath/bin && \ - mkdir /root/gopath/src -ENV GOROOT=/usr/local/go GOPATH=/root/gopath -# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. -ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin -# install glide -RUN curl -s -q https://glide.sh/get | sh - -# Install TensorRT -# following TensorRT.tar.gz is not the default official one, we do two miny changes: -# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, -# and its size is only one-third of the official one. -# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. -# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - tar -xz -C /usr/local && \ - cp -rf /usr/local/TensorRT/include /usr && \ - cp -rf /usr/local/TensorRT/lib /usr - -# git credential to skip password typing -RUN git config --global credential.helper store - -# Fix locales to en_US.UTF-8 -RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter -# version util jupyter fixes this issue. - -# specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# version(1.7.1 for now), which causes building documentation failed. +COPY tools/manylinux1/build_scripts/* /root/python/ +RUN cd /root/python/ && source build_utils && MY_DIR=/root/python/ build_cpythons 3.6.0 3.7.0 + +# # Install Go and glide +# RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + # tar -xz -C /usr/local && \ + # mkdir /root/gopath && \ + # mkdir /root/gopath/bin && \ + # mkdir /root/gopath/src +# ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. 
+# ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# # install glide +# RUN curl -s -q https://glide.sh/get | sh + +# # Install TensorRT +# # following TensorRT.tar.gz is not the default official one, we do two miny changes: +# # 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# # and its size is only one-third of the official one. +# # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. +# RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ + # tar -xz -C /usr/local && \ + # cp -rf /usr/local/TensorRT/include /usr && \ + # cp -rf /usr/local/TensorRT/lib /usr + +# # git credential to skip password typing +# RUN git config --global credential.helper store + +# # Fix locales to en_US.UTF-8 +# RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter +# # version util jupyter fixes this issue. + +# # specify sphinx version as 1.5.6 and remove -U option for [pip install -U +# # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest +# # version(1.7.1 for now), which causes building documentation failed. # RUN pip3 install -U wheel && \ # pip3 install -U docopt PyYAML sphinx==1.5.6 && \ # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ From 64ca3d176cc1348f0735e3e6f4fd2c18e902f43b Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Fri, 23 Nov 2018 19:18:20 +0800 Subject: [PATCH 26/36] Add bias_attr in sequence_conv_pool API. (#14553) --- paddle/fluid/API.spec | 2 +- python/paddle/fluid/nets.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 50114bf3df..8397ae093b 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -342,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)) -paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) +paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, 
keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 00d33b36fc..fb75ef62d0 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -250,7 +250,8 @@ def sequence_conv_pool(input, filter_size, param_attr=None, act="sigmoid", - pool_type="max"): + pool_type="max", + bias_attr=None): """ The sequence_conv_pool is composed with Sequence Convolution and Pooling. @@ -266,6 +267,11 @@ def sequence_conv_pool(input, pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling. Default :math:`max`. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, sequence_conv + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. Return: Variable: The final result after Sequence Convolution and Pooling. @@ -289,6 +295,7 @@ def sequence_conv_pool(input, num_filters=num_filters, filter_size=filter_size, param_attr=param_attr, + bias_attr=bias_attr, act=act) pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type) From e8a8f2626cc65b8dc3a91507eb233581e8e7e0e2 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 16:58:57 +0800 Subject: [PATCH 27/36] Add Python3.6 and Python3.7 support in Ubuntu Dockerfile test=develop --- Dockerfile | 200 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 118 insertions(+), 82 deletions(-) diff --git a/Dockerfile b/Dockerfile index eb7bb2549e..6f45c79f3a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,6 +22,27 @@ ENV HOME /root # Add bash enhancements COPY ./paddle/scripts/docker/root/ /root/ +# Prepare packages for Python +RUN apt-get update && \ + apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ + libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ + xz-utils tk-dev libffi-dev liblzma-dev + +# Install Python3.6 +RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ + tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ + ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ + wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ + tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.6 --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + +# Install Python3.7 +RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ + tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7 --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + RUN apt-get update && \ apt-get install -y --allow-downgrades patchelf \ python3 python3-dev python3-pip \ @@ -34,88 +55,103 @@ RUN apt-get update && \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ build-essential checkinstall \ - libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ + libreadline-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools 
libtool ccache && \ apt-get clean -y -COPY tools/manylinux1/build_scripts/* /root/python/ -RUN cd /root/python/ && source build_utils && MY_DIR=/root/python/ build_cpythons 3.6.0 3.7.0 - -# # Install Go and glide -# RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ - # tar -xz -C /usr/local && \ - # mkdir /root/gopath && \ - # mkdir /root/gopath/bin && \ - # mkdir /root/gopath/src -# ENV GOROOT=/usr/local/go GOPATH=/root/gopath -# # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. -# ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin -# # install glide -# RUN curl -s -q https://glide.sh/get | sh - -# # Install TensorRT -# # following TensorRT.tar.gz is not the default official one, we do two miny changes: -# # 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, -# # and its size is only one-third of the official one. -# # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. -# # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -# RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - # tar -xz -C /usr/local && \ - # cp -rf /usr/local/TensorRT/include /usr && \ - # cp -rf /usr/local/TensorRT/lib /usr - -# # git credential to skip password typing -# RUN git config --global credential.helper store - -# # Fix locales to en_US.UTF-8 -# RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -# # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter -# # version util jupyter fixes this issue. - -# # specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# # version(1.7.1 for now), which causes building documentation failed. -# RUN pip3 install -U wheel && \ - # pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - # easy_install -U pip && \ - # pip install -U pip setuptools wheel && \ - # pip install -U docopt PyYAML sphinx==1.5.6 && \ - # pip install sphinx-rtd-theme==0.1.9 recommonmark - -# RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - # pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - # pip3 install opencv-python && \ - # pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - # pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - # pip install opencv-python - -# #For docstring checker -# RUN pip3 install pylint pytest astroid isort -# RUN pip install pylint pytest astroid isort LinkChecker - -# COPY ./python/requirements.txt /root/ -# RUN pip3 install -r /root/requirements.txt -# RUN pip install -r /root/requirements.txt - -# # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use -# # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -# RUN apt-get install -y libssl-dev libffi-dev -# RUN pip3 install certifi urllib3[secure] -# RUN pip install certifi urllib3[secure] - - -# # Install woboq_codebrowser to /woboq -# RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - # (cd /woboq \ - # cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - # -DCMAKE_BUILD_TYPE=Release . \ - # make) - -# # Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service -# RUN mkdir /var/run/sshd -# RUN echo 'root:root' | chpasswd -# RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config -# RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config -# EXPOSE 22 +# Install Go and glide +RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + +# Install TensorRT +# following TensorRT.tar.gz is not the default official one, we do two miny changes: +# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# and its size is only one-third of the official one. +# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. +RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ + tar -xz -C /usr/local && \ + cp -rf /usr/local/TensorRT/include /usr && \ + cp -rf /usr/local/TensorRT/lib /usr + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter +# version util jupyter fixes this issue. + +# specify sphinx version as 1.5.6 and remove -U option for [pip install -U +# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest +# version(1.7.1 for now), which causes building documentation failed. 
+RUN pip3.5 install -U wheel && \ + pip3.5 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.5 install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 install -U wheel && \ + pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 install -U wheel && \ + pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ + easy_install -U pip && \ + pip install -U pip setuptools wheel && \ + pip install -U docopt PyYAML sphinx==1.5.6 && \ + pip install sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.5 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.5 install opencv-python && \ + pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 install opencv-python && \ + pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 install opencv-python && \ + pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip install opencv-python + +#For docstring checker +RUN pip3.5 install pylint pytest astroid isort +RUN pip3.6 install pylint pytest astroid isort +RUN pip3.7 install pylint pytest astroid isort +RUN pip install pylint pytest astroid isort LinkChecker + +COPY ./python/requirements.txt /root/ +RUN pip3.5 install -r /root/requirements.txt +RUN pip3.6 install -r /root/requirements.txt +RUN pip3.7 install -r /root/requirements.txt +RUN pip install -r /root/requirements.txt + +# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use +# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 +RUN apt-get install -y libssl-dev libffi-dev +RUN pip3.5 install certifi urllib3[secure] +RUN pip3.6 install certifi urllib3[secure] +RUN pip3.7 install certifi urllib3[secure] +RUN pip install certifi urllib3[secure] + + +# Install woboq_codebrowser to /woboq +RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ + (cd /woboq \ + cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ + -DCMAKE_BUILD_TYPE=Release . \ + make) + +# Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service +RUN mkdir /var/run/sshd +RUN echo 'root:root' | chpasswd +RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config +RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +EXPOSE 22 From 155a0f78e6f8688a68dae765dee4d25e08bc614b Mon Sep 17 00:00:00 2001 From: Min Date: Sat, 24 Nov 2018 17:17:14 +0800 Subject: [PATCH 28/36] Polish code test=develop --- Dockerfile | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6f45c79f3a..9459552890 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,13 +34,13 @@ RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-a ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.6 --enable-shared > /dev/null && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null # Install Python3.7 RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7 --enable-shared > /dev/null && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null RUN apt-get update && \ @@ -54,8 +54,6 @@ RUN apt-get update && \ automake locales clang-format swig cmake \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ - build-essential checkinstall \ - libreadline-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools libtool ccache && \ apt-get clean -y @@ -94,9 +92,9 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. 
-RUN pip3.5 install -U wheel && \ - pip3.5 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.5 install sphinx-rtd-theme==0.1.9 recommonmark && \ +RUN pip3 install -U wheel && \ + pip3 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.6 install -U wheel && \ pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ @@ -108,9 +106,9 @@ RUN pip3.5 install -U wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install sphinx-rtd-theme==0.1.9 recommonmark -RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.5 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.5 install opencv-python && \ +RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 install opencv-python && \ pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3.6 install opencv-python && \ @@ -122,13 +120,13 @@ RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip install opencv-python #For docstring checker -RUN pip3.5 install pylint pytest astroid isort +RUN pip3 install pylint pytest astroid isort RUN pip3.6 install pylint pytest astroid isort RUN pip3.7 install pylint pytest astroid isort RUN pip install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ -RUN pip3.5 install -r /root/requirements.txt +RUN pip3 install -r /root/requirements.txt RUN pip3.6 install -r /root/requirements.txt RUN pip3.7 install -r /root/requirements.txt RUN pip install -r /root/requirements.txt @@ -136,7 +134,7 @@ RUN pip install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 RUN apt-get install -y libssl-dev libffi-dev -RUN pip3.5 install certifi urllib3[secure] +RUN pip3 install certifi urllib3[secure] RUN pip3.6 install certifi urllib3[secure] RUN pip3.7 install certifi urllib3[secure] RUN pip install certifi urllib3[secure] From 05e6a7141717f1eb1e73836c7aec67faea4d4db5 Mon Sep 17 00:00:00 2001 From: Min Date: Sat, 24 Nov 2018 17:19:22 +0800 Subject: [PATCH 29/36] Polish code test=develop --- .dockerignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 49adfe4f0a..2b2e74053d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,5 @@ *.DS_Store build/ -build* *.user .vscode .idea From c1bf9664cdf1187c3e01750ef68cfbbd8d788b7d Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 25 Nov 2018 14:11:23 +0800 Subject: [PATCH 30/36] Add options to disable SO_REUSEPORT of grpc. (#14269) --- .../operators/distributed/grpc_client.cc | 5 +++++ .../operators/distributed/grpc_server.cc | 20 +++++++++++++++++++ .../operators/distributed/sendrecvop_utils.cc | 2 ++ python/paddle/fluid/__init__.py | 1 + 4 files changed, 28 insertions(+) diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index c28f86146d..0bd76b3f6c 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -22,6 +22,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_bool(rpc_disable_reuse_port); + namespace paddle { namespace operators { namespace distributed { @@ -383,6 +385,9 @@ std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { // Channel configurations: grpc::ChannelArguments args; args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); + if (FLAGS_rpc_disable_reuse_port) { + args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); + } args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); args.SetMaxSendMessageSize(std::numeric_limits::max()); args.SetMaxReceiveMessageSize(std::numeric_limits::max()); diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index ffd2b1707b..77bf67be25 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -20,6 +20,8 @@ limitations under the License. */ using ::grpc::ServerAsyncResponseWriter; +DECLARE_bool(rpc_disable_reuse_port); + namespace paddle { namespace operators { namespace distributed { @@ -252,6 +254,20 @@ void AsyncGRPCServer::WaitServerReady() { VLOG(40) << "AsyncGRPCServer WaitSeverReady"; } +// Define an option subclass in order to disable SO_REUSEPORT for the +// server socket. +// Come from: +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +class NoReusePortOption : public ::grpc::ServerBuilderOption { + public: + void UpdateArguments(::grpc::ChannelArguments* args) override { + args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); + } + + void UpdatePlugins(std::vector>* + plugins) override {} +}; + void AsyncGRPCServer::StartServer() { ::grpc::ServerBuilder builder; builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), @@ -259,6 +275,10 @@ void AsyncGRPCServer::StartServer() { builder.SetMaxSendMessageSize(std::numeric_limits::max()); builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); + if (FLAGS_rpc_disable_reuse_port) { + builder.SetOption( + std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); + } builder.RegisterService(&service_); for (auto t : rpc_call_map_) { diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 374fa680e3..df5af3476f 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -22,6 +22,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" +DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not."); + namespace paddle { namespace operators { namespace distributed { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 3c092dee34..d851b9dfaa 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -129,6 +129,7 @@ def __bootstrap__(): read_env_flags.append('rpc_send_thread_num') read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_prefetch_thread_num') + read_env_flags.append('rpc_disable_reuse_port') if core.is_compiled_with_cuda(): read_env_flags += [ From a7188d5bc7a8990ce228902e0361ae7880bb1927 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Sun, 25 Nov 2018 17:01:49 +0800 Subject: [PATCH 31/36] fix executor transfer cache bug (#14518) --- paddle/fluid/framework/CMakeLists.txt | 8 ++- paddle/fluid/framework/executor.cc | 1 + paddle/fluid/framework/naive_executor.cc | 1 + paddle/fluid/framework/operator.cc | 49 +++++-------- paddle/fluid/framework/operator.h | 4 ++ .../fluid/framework/transfer_scope_cache.cc | 72 +++++++++++++++++++ paddle/fluid/framework/transfer_scope_cache.h | 41 +++++++++++ 7 files changed, 143 insertions(+), 33 deletions(-) create mode 100644 paddle/fluid/framework/transfer_scope_cache.cc create mode 100644 paddle/fluid/framework/transfer_scope_cache.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 281d073166..83c8478685 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -116,8 +116,14 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) +if (NOT WIN32) +cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto) +cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog + shape_inference data_transform lod_tensor profiler transfer_scope_cache) +else() cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler) + shape_inference data_transform lod_tensor) +endif(NOT WIN32) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 7ce08b728d..f6c82995e1 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ngraph_operator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index e8e53f988f..e829563952 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -83,6 +83,7 @@ void NaiveExecutor::Run() { for (auto &op : ops_) { VLOG(3) << std::this_thread::get_id() << " run " << op->Type() << " on scope " << scope_; + op->SetIsCalledByExecutor(false); op->Run(*scope_, place_); } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 1ec170b6f6..0084573cd0 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/profiler.h" @@ -33,11 +34,6 @@ DEFINE_bool(check_nan_inf, false, namespace paddle { namespace framework { -// Combine two hash values to a single hash. -inline size_t CombineHash(size_t seed, size_t a) { - return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2); -} - std::vector> kKernelPriority = { std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), @@ -797,17 +793,6 @@ void OperatorWithKernel::TransferInplaceVarsBack( Scope* OperatorWithKernel::TryTransferData( const Scope& scope, const OpKernelType& expected_kernel_key, std::vector* transfered_inplace_vars) const { -// In the inference scenerio, the scopes will be reused across the batches, so -// the `new_scope` here will result in GPU memroy explosion over the running of -// operators. -// We use a thread_local cache to fix that issue, the key in the cache is the -// combination of the `scope` argument, from_kernel_type, target_kernel_type. -// Have a discussion with @Superjomn or the inference developers if some changes -// on this logic for this macro might not tested on the other scenerios. -#ifdef PADDLE_ON_INFERENCE - thread_local std::unordered_map infer_transfer_scope_cache; -#endif - Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { for (auto& var_name : var_name_item.second) { @@ -838,23 +823,23 @@ Scope* OperatorWithKernel::TryTransferData( VLOG(30) << "Transform Variable " << var_name << " from " << kernel_type_for_var << " to " << expected_kernel_key; -#ifdef PADDLE_ON_INFERENCE - size_t infer_cache_key = - CombineHash(OpKernelType::Hash()(kernel_type_for_var), - OpKernelType::Hash()(expected_kernel_key)); - infer_cache_key = - CombineHash(infer_cache_key, std::hash()(&scope)); - - auto it = infer_transfer_scope_cache.find(infer_cache_key); - if (it != infer_transfer_scope_cache.end()) { - new_scope = infer_transfer_scope_cache[infer_cache_key]; - } else { - new_scope = &scope.NewScope(); - infer_transfer_scope_cache[infer_cache_key] = new_scope; + // In the inference scenerio, the scopes will be reused across the + // batches, so the `new_scope` here will result in GPU memroy explosion + // over the running of operators. 
+      // We use a thread_local cache to fix that issue, the key in the cache is
+      // the combination of the `scope` argument, from_kernel_type,
+      // target_kernel_type.
+      // Have a discussion with @Superjomn or the inference developers if some
+      // changes on this logic for this macro might not be tested on the other
+      // scenarios.
+      // If this op is not called by an Executor or ParallelExecutor, it should
+      // be called by a NaiveExecutor; the NaiveExecutor caches the scopes and
+      // variables, which behaves quite differently.
+      if (!run_by_executor_) {
+        new_scope = TryCreateTransferScope(kernel_type_for_var,
+                                           expected_kernel_key, &scope);
       }
-#endif
-
-      if (new_scope == nullptr) {
+      if (!new_scope) {
         new_scope = &scope.NewScope();
       }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index ef83833217..bfdfdc56b3 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -127,6 +127,8 @@ class OperatorBase {
   //! Get all outputs variable names
   virtual std::vector<std::string> OutputVars(bool has_intermediate) const;

+  void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
+
 protected:
   std::string type_;
   // NOTE: in case of OpGrad, inputs_ contains:
@@ -139,6 +141,8 @@ class OperatorBase {
   // IG (Inputs Gradients)
   VariableNameMap outputs_;
   AttributeMap attrs_;
+  // Whether this operator executes in an Executor.
+  bool run_by_executor_{true};

 private:
  void GenerateTemporaryNames();
diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc
new file mode 100644
index 0000000000..e52a8317e2
--- /dev/null
+++ b/paddle/fluid/framework/transfer_scope_cache.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/transfer_scope_cache.h"
+
+namespace paddle {
+namespace framework {
+
+std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
+  thread_local auto* x = new std::unordered_map<size_t, Scope*>;
+  return *x;
+}
+
+std::unordered_set<Scope*>& global_transfer_scope_cache() {
+  thread_local auto* x = new std::unordered_set<Scope*>;
+  return *x;
+}
+
+Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
+                              const Scope* scope) {
+  Scope* new_scope{nullptr};
+  size_t infer_cache_key =
+      CombineHash(OpKernelType::Hash()(type0), OpKernelType::Hash()(type1));
+  infer_cache_key =
+      CombineHash(infer_cache_key, std::hash<const Scope*>()(scope));
+
+  auto it = global_transfer_data_cache().find(infer_cache_key);
+  if (it != global_transfer_data_cache().end()) {
+    new_scope = global_transfer_data_cache()[infer_cache_key];
+  } else {
+    new_scope = &scope->NewScope();
+    global_transfer_data_cache()[infer_cache_key] = new_scope;
+  }
+  global_transfer_scope_cache().insert(new_scope);
+  return new_scope;
+}
+
+void RemoveKidsFromTransferScopeCache(Scope* scope) {
+  auto it = global_transfer_scope_cache().find(scope);
+  if (it != global_transfer_scope_cache().end()) {
+    global_transfer_scope_cache().erase(it);
+  }
+  for (auto* s : scope->kids()) {
+    auto it = global_transfer_scope_cache().find(s);
+    if (it != global_transfer_scope_cache().end()) {
+      global_transfer_scope_cache().erase(it);
+    }
+  }
+
+  // remove global transfer data cache
+  auto& cache = global_transfer_data_cache();
+  for (auto it = cache.begin(); it != cache.end();) {
+    if (it->second == scope)
+      it = cache.erase(it);
+    else
+      it++;
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/transfer_scope_cache.h b/paddle/fluid/framework/transfer_scope_cache.h
new file mode 100644
index 0000000000..86fc0bf529
--- /dev/null
+++ b/paddle/fluid/framework/transfer_scope_cache.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <mutex>  // NOLINT
+#include <unordered_map>
+#include <unordered_set>
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+
+std::unordered_map<size_t, Scope*>& global_transfer_data_cache();
+
+std::unordered_set<Scope*>& global_transfer_scope_cache();
+
+// Combine two hash values to a single hash.
+static size_t CombineHash(size_t seed, size_t a) { + return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1, + const Scope* scope); + +void RemoveKidsFromTransferScopeCache(Scope* scope); + +} // namespace framework +} // namespace paddle From c52f65e0711cf5bb061df9743a631e9e63694c68 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Sun, 25 Nov 2018 17:33:31 +0800 Subject: [PATCH 32/36] Fix the random fail in test_image_classification.py (#14551) test=develop --- python/paddle/fluid/tests/book/test_image_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index cba486cf59..c91bd27895 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -239,7 +239,7 @@ def infer(use_cuda, save_dirname=None): assert len(results[0]) == len(transpiler_results[0]) for i in range(len(results[0])): np.testing.assert_almost_equal( - results[0][i], transpiler_results[0][i], decimal=5) + results[0][i], transpiler_results[0][i], decimal=4) print("infer results: ", results[0]) From 923c8e33325690d304cf6f0e13f29a4ea1611544 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Sun, 25 Nov 2018 22:40:24 +0800 Subject: [PATCH 33/36] add benchmark for inference (#14571) --- paddle/fluid/inference/CMakeLists.txt | 1 + paddle/fluid/inference/utils/CMakeLists.txt | 2 + paddle/fluid/inference/utils/benchmark.cc | 49 +++++++++++++++++ paddle/fluid/inference/utils/benchmark.h | 52 +++++++++++++++++++ .../fluid/inference/utils/benchmark_tester.cc | 39 ++++++++++++++ 5 files changed, 143 insertions(+) create mode 100644 paddle/fluid/inference/utils/CMakeLists.txt create mode 100644 paddle/fluid/inference/utils/benchmark.cc create mode 100644 paddle/fluid/inference/utils/benchmark.h create mode 100644 paddle/fluid/inference/utils/benchmark_tester.cc diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 2c5364b724..058a5b5f46 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -4,6 +4,7 @@ endif() # analysis and tensorrt must be added before creating static library, # otherwise, there would be undefined reference to them in static library. add_subdirectory(analysis) +add_subdirectory(utils) if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt new file mode 100644 index 0000000000..2104e4ac72 --- /dev/null +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(benchmark SRCS benchmark.cc DEPS enforce) +cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc new file mode 100644 index 0000000000..021edc2de5 --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/utils/benchmark.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { + +std::string Benchmark::SerializeToString() const { + std::stringstream ss; + ss << "-----------------------------------------------------\n"; + ss << "name\t"; + ss << "batch_size\t"; + ss << "num_threads\t"; + ss << "latency\t"; + ss << "qps"; + ss << '\n'; + + ss << name_ << "\t"; + ss << batch_size_ << "\t"; + ss << num_threads_ << "\t"; + ss << latency_ << "\t"; + ss << 1000 / latency_; + ss << '\n'; + return ss.str(); +} +void Benchmark::PersistToFile(const std::string &path) const { + std::ofstream file(path, std::ios::app); + PADDLE_ENFORCE(file.is_open(), "Can not open %s to add benchmark", path); + file << SerializeToString(); + file.flush(); + file.close(); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h new file mode 100644 index 0000000000..80e8f77adb --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace paddle { +namespace inference { + +/* + * Helper class to calculate the performance. + */ +struct Benchmark { + int batch_size() const { return batch_size_; } + void SetBatchSize(int x) { batch_size_ = x; } + + int num_threads() const { return num_threads_; } + void SetNumThreads(int x) { num_threads_ = x; } + + bool use_gpu() const { return use_gpu_; } + void SetUseGpu() { use_gpu_ = true; } + + int latency() const { return latency_; } + void SetLatency(int x) { latency_ = x; } + + const std::string& name() const { return name_; } + void SetName(const std::string& name) { name_ = name; } + + std::string SerializeToString() const; + void PersistToFile(const std::string& path) const; + + private: + bool use_gpu_{false}; + int batch_size_{0}; + int latency_; + int num_threads_{1}; + std::string name_; +}; + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc new file mode 100644 index 0000000000..eb25547408 --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/utils/benchmark.h"
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+using namespace paddle::inference;
+TEST(Benchmark, basic) {
+  Benchmark benchmark;
+  benchmark.SetName("key0");
+  benchmark.SetBatchSize(10);
+  benchmark.SetUseGpu();
+  benchmark.SetLatency(220);
+  LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString();
+}
+
+TEST(Benchmark, PersistToFile) {
+  Benchmark benchmark;
+  benchmark.SetName("key0");
+  benchmark.SetBatchSize(10);
+  benchmark.SetUseGpu();
+  benchmark.SetLatency(220);
+
+  benchmark.PersistToFile("1.log");
+  benchmark.PersistToFile("1.log");
+  benchmark.PersistToFile("1.log");
+}
\ No newline at end of file
From 840c1b29adb1168f8ff84d1151462b0d171c741b Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Mon, 26 Nov 2018 10:30:25 +0800
Subject: [PATCH 34/36] test=develop (#14562)

* test=develop remove code.
* test=develop
---
 paddle/fluid/inference/api/demo_ci/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 49683eab07..8fb464c0f5 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -46,8 +46,6 @@ if(WITH_GPU)
     endif()
   endif(NOT WIN32)
 endif()
-
-include_directories("D:/Paddle/")
 include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
From bf222f197d0ac5f7416d5e70d5ddca1009b96559 Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Mon, 26 Nov 2018 11:21:26 +0800
Subject: [PATCH 35/36] Use sub scope in tensor_array_to_tensor op. (#14524)

test=develop
---
 paddle/fluid/framework/executor.cc             |  4 +-
 .../fluid/inference/api/analysis_predictor.cc  |  1 -
 paddle/fluid/inference/api/api_impl.cc         |  1 -
 .../fluid/inference/tests/api/CMakeLists.txt   | 50 +++++++++----------
 .../operators/tensor_array_to_tensor_op.cc     | 10 ++--
 5 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index f6c82995e1..3dc571d757 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -392,8 +392,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector<Tensor>> gc;
-  // WhileOp would set keep_kids to false
-  // WhileGradOp would need the scopes created in WhileOp
+  // WhileOp would set keep_kids to true,
+  // because WhileGradOp needs the scopes created in WhileOp.
   // Perhaps, we should not perform eager deletion in WhileOp
   // The scopes and variables created by WhileOp would be deleted
   // in WhileGradOp.
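The changes in this patch all follow the same scope-lifetime rule: an operator that needs scratch variables should create them in a child scope rather than const_cast the scope it was handed, so the temporaries can be dropped as a unit without mutating the caller's state. The standalone sketch below illustrates the idea with a toy scope tree; ToyScope and its NewScope/Var/DropKids methods are simplified stand-ins invented for this example, not Paddle's actual framework::Scope API.

// Minimal stand-in for a scope hierarchy; names and structure are
// illustrative only and do not match paddle::framework::Scope.
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct ToyScope {
  // Create a child scope owned by this scope.
  ToyScope* NewScope() {
    kids_.emplace_back(new ToyScope);
    return kids_.back().get();
  }
  // Create (or fetch) a named variable in this scope.
  int* Var(const std::string& name) { return &vars_[name]; }
  size_t NumVars() const { return vars_.size(); }
  // Drop all children and the temporaries they own.
  void DropKids() { kids_.clear(); }

 private:
  std::map<std::string, int> vars_;
  std::vector<std::unique_ptr<ToyScope>> kids_;
};

// An "operator" that needs a scratch variable. Creating it in a child
// scope keeps the parent scope's variable map unchanged, with no
// const_cast needed on the parent.
void RunToyOp(ToyScope* parent) {
  ToyScope* local = parent->NewScope();
  int* tmp = local->Var("out_index");  // temporary lives in the child
  *tmp = 42;
  parent->DropKids();  // scratch state is released as one unit
}

int main() {
  ToyScope root;
  RunToyOp(&root);
  std::cout << "vars in root after op: " << root.NumVars() << "\n";  // 0
}

In the operator diff further down, the local_scope created by scope.NewScope() plays the role of `local` here: the out_index temporary and the inner concat op both run against it instead of the shared inference scope.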
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index c132ce326c..d111c0cfd9 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -174,7 +174,6 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   inference::Timer timer;
   timer.tic();
   // set feed variable
-  std::vector<framework::LoDTensor> feeds;
   framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
   if (!SetFeed(inputs, scope)) {
     LOG(ERROR) << "fail to set feed";
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 66a8e51396..ff375c73a7 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -138,7 +138,6 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   Timer timer;
   timer.tic();
   // set feed variable
-  std::vector<framework::LoDTensor> feeds;
   framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
   if (!SetFeed(inputs, scope)) {
     LOG(ERROR) << "fail to set feed";
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index e8bd13037e..7dc88d9dd0 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -74,7 +74,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
-    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/"  "inference-vis-demos%2Focr.tar.gz")
+  inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
 inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
@@ -88,31 +88,31 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
-    # anakin rnn1
-    set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
-    set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
-    inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
-    inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
-    cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
-            ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
-            --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
-            DEPS inference_anakin_api_shared SERIAL)
-    # anakin mobilenet
-    if(WITH_GPU)
-        set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
-        inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
-        cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
-                ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
-                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
-    endif()
+  # anakin rnn1
+  set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
+  set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
+  inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
+  inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
+  cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
+          ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
+          --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
+          DEPS inference_anakin_api_shared SERIAL)
+  # anakin mobilenet
+  if(WITH_GPU)
+    set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
+    inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
+    cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
+            ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
+            DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+  endif()
 endif()
 
 if(WITH_GPU AND TENSORRT_FOUND)
-    set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
-    if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
-        inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
-    endif()
-    inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
+  set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
+  if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
+    inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
+  endif()
+  inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
+      EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+      ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
 endif()
diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
index 96dc123f6a..58a74ec2c1 100644
--- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc
+++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
@@ -106,9 +106,9 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
     out_inx_dim[0] = inx.size();
     out_inx.Resize(out_inx_dim);
 
+    auto &local_scope = scope.NewScope();
     std::string var_name = "out_index";
-    framework::Variable *tmp_index_var =
-        const_cast<framework::Scope &>(scope).Var(var_name);
+    framework::Variable *tmp_index_var = local_scope.Var(var_name);
     auto &tmp_index_tensor = *(tmp_index_var->GetMutable<framework::LoDTensor>());
     tmp_index_tensor.Resize(out_inx_dim);
@@ -128,12 +128,12 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
     out_dims[axis] = out_dim_sum;
     out.Resize(out_dims);
 
-    LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
-    // Invoke Reshape Op
+    LodTensorArray2LodTensorVector(local_scope, base_name, Input("X"), &names);
+    // Invoke concat Op
     auto concat_op = framework::OpRegistry::CreateOp(
         "concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs);
 
-    concat_op->Run(scope, place);
+    concat_op->Run(local_scope, place);
   }
 };
From 39ec80def42ef48112ee0025e2a318138f1992b7 Mon Sep 17 00:00:00 2001
From: qingqing01
Date: Mon, 26 Nov 2018 12:49:17 +0800
Subject: [PATCH 36/36] Remove the memory copy of feeding data in C++ inference API (#14577)

* Remove the memory copy for feeding data in C++ inference API
* Fix compiling dependence
* Fix compiling in ONLY_CPU mode
---
 paddle/fluid/inference/api/CMakeLists.txt      |  4 +++-
 .../fluid/inference/api/analysis_predictor.cc  | 23 ++++++++++++++----
 paddle/fluid/inference/api/api_impl.cc         | 24 +++++++++++++++----
 3 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index e9969b84f3..eda251c534 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
+    lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
+    analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array)
 
 cc_test(test_paddle_inference_api
     SRCS api_tester.cc
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index d111c0cfd9..72ac534384 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -31,6 +31,7 @@
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #endif
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -214,17 +215,29 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     framework::DDim ddim = framework::make_ddim(inputs[i].shape);
     void *input_ptr;
     if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
     }
 
-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
 
     // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
     framework::LoD lod;
     for (auto &level : inputs[i].lod) {
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index ff375c73a7..0f88ad14b0 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -193,17 +194,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     framework::DDim ddim = framework::make_ddim(inputs[i].shape);
     void *input_ptr;
     if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
     }
 
-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
+
     // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
     framework::LoD lod;
     for (auto &level : inputs[i].lod) {
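Both SetFeed changes above implement one pattern: allocate the feed tensor's buffer directly on the destination place, then pick the copy primitive by inspecting that place, so each input moves exactly once instead of being staged on the CPU first. The standalone sketch below models only that dispatch; Place, DeviceBuffer, CopyToDevice, and FeedInput are hypothetical stand-ins written for this example, not Paddle APIs (the real code calls LoDTensor::mutable_data and memory::Copy on stream 0).

// Illustrative model of the feed copy dispatch; every type here is a
// made-up stand-in, not Paddle's framework or memory API.
#include <cstring>
#include <iostream>
#include <vector>

enum class Place { kCPU, kGPU };

// Pretend device allocation: for the sketch both places live in host memory.
struct DeviceBuffer {
  explicit DeviceBuffer(size_t n) : bytes(n) {}
  std::vector<unsigned char> bytes;
};

// Stand-in for a synchronous host-to-device copy (memory::Copy in the patch).
void CopyToDevice(DeviceBuffer* dst, const void* src, size_t n) {
  std::memcpy(dst->bytes.data(), src, n);
}

// One copy either way: the buffer already lives on the target place,
// so no intermediate CPU staging tensor is needed.
void FeedInput(Place place, DeviceBuffer* dst, const void* src, size_t n) {
  if (place == Place::kCPU) {
    std::memcpy(dst->bytes.data(), src, n);  // plain host copy
  } else {
    CopyToDevice(dst, src, n);  // sync H2D copy (stream 0 in the patch)
  }
}

int main() {
  const float feed[4] = {1.f, 2.f, 3.f, 4.f};
  DeviceBuffer buf(sizeof(feed));
  FeedInput(Place::kGPU, &buf, feed, sizeof(feed));
  std::cout << "copied " << buf.bytes.size() << " bytes\n";
}

With this structure the CPU path stays a single std::memcpy, and supporting another device type means adding one branch rather than a second staging buffer.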